In [None]:
# Mutual Information

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases dropped the old names, so use whichever spelling this install provides.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

# Load the tutorial dataset and preview its first rows.
df = pd.read_csv('data.csv')
df.head()

In [None]:
# scikit-learn has to be told which features are discrete and which are continuous,
# so the categorical features are label-encoded and then flagged as discrete.
X = df.copy()
y = X.pop('target')

# Label encoding for categoricals (object and pandas "category" columns alike).
for colname in X.select_dtypes(["object", "category"]):
    X[colname] = X[colname].factorize()[0]

# All discrete features should now have integer dtypes (double-check this before using MI!).
# `X.dtypes == int` would only match the platform's default integer dtype and miss
# other integer widths, so test each column with is_integer_dtype instead.
discrete_features = pd.Series(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes],
    index=X.columns,
    dtype=bool,
)

In [None]:
from sklearn.feature_selection import mutual_info_regression
# Compute the mutual-information score of every feature.
def make_mi_scores(X, y, discrete_features, random_state=0):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, handed to ``mutual_info_regression`` unchanged.
    y : array-like
        Target values.
    discrete_features : array-like of bool
        Forwarded to ``mutual_info_regression`` to mark which columns of ``X``
        are discrete.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression``. A fixed default makes the
        scores reproducible from run to run; pass ``None`` for an unseeded estimate.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column name, highest first.
    """
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Score every feature, then preview every third entry of the ranked result.
mi_scores = make_mi_scores(X, y, discrete_features=discrete_features)
mi_scores[::3]

In [None]:
# Draw a bar chart of the mutual-information scores.
def plot_mi_scores(scores):
    """Plot MI scores as horizontal bars using the pyplot interface.

    ``scores`` is a pandas Series of scores indexed by feature name. The bars
    are laid out in ascending order of score, one tick label per feature.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Open an 8x5 inch figure at 100 dpi and chart all of the scores on it.
plt.figure(figsize=(8, 5), dpi=100)
plot_mi_scores(scores=mi_scores)

In [None]:
# Plot price against curb weight.
sns.relplot(x="curb_weight", y="price", data=df)

# The fuel_type feature has a fairly low MI score, but the figure below shows that it
# clearly separates two price populations with different trends within the horsepower
# feature. This indicates that fuel_type contributes an interaction effect and may not
# be unimportant after all. Before deciding from its MI score that a feature is
# unimportant, it is worth investigating any possible interaction effects -- domain
# knowledge can offer a lot of guidance here.
sns.lmplot(x="horsepower", y="price", hue="fuel_type", data=df)

In [None]:
# The exercise starts from this cell
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases dropped the old names, so use whichever spelling this install provides.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)


# Load data
df = pd.read_csv("../input/fe-course-data/ames.csv")


# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Return the mutual-information score of each column of ``X`` against ``y``.

    Works on a copy of ``X``: object and category columns are replaced by their
    integer factorize codes, every integer-typed column is flagged as discrete,
    and the scores come from ``mutual_info_regression`` with ``random_state=0``.
    The result is a Series named "MI Scores", indexed by column, sorted from
    highest to lowest.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for col in categorical_cols:
        encoded[col] = encoded[col].factorize()[0]
    # After encoding, an integer dtype is what marks a column as discrete.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Render ``scores`` (a Series indexed by feature name) as a horizontal bar chart.

    The scores are sorted ascending before plotting, and each bar is labelled
    with its feature name through the y-axis ticks.
    """
    ranked = scores.sort_values()  # ascending is the default order
    labels = list(ranked.index)
    y_pos = np.arange(len(labels))
    plt.barh(y_pos, ranked)
    plt.yticks(y_pos, labels)
    plt.title("Mutual Information Scores")

In [None]:
# Get a rough visual sense of which of these features relates most strongly to the sale price.
features = ["YearBuilt", "MoSold", "ScreenPorch"]
# melt() keeps the `id_vars` column as it is and unpivots the `value_vars` columns
# into "variable"/"value" pairs, so each feature gets its own facet.
# `facet_kws` is a dict of options controlling the facet layout; sharex=False
# means the subplots do not share an x-axis.
sns.relplot(
    data=df.melt(value_vars=features, id_vars="SalePrice"),
    x="value",
    y="SalePrice",
    col="variable",
    facet_kws={"sharex": False},
)

In [None]:
# Show the mutual-information scores of the top twenty features.
X = df.copy()
y = X.pop("SalePrice")  # pop() removes the target column from X and returns it

mi_scores = make_mi_scores(X=X, y=y)
print(mi_scores.head(20))
# print(mi_scores.tail(20))  # uncomment to see bottom 20

plt.figure(figsize=(8, 5), dpi=100)
plot_mi_scores(scores=mi_scores.head(20))
# plot_mi_scores(mi_scores.tail(20))  # uncomment to see bottom 20

In [None]:
# The boxen plot shows how sale price varies with building type; no obvious
# relationship between the two is visible.
sns.catplot(data=df, kind="boxen", x="BldgType", y="SalePrice")

In [None]:
# Check whether BldgType interacts with another feature.
feature = "GrLivArea"

sns.lmplot(
    data=df,
    x=feature,
    y="SalePrice",
    hue="BldgType",
    col="BldgType",
    col_wrap=3,
    height=4,
    scatter_kws=dict(edgecolor="w"),
)