In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from xgboost import XGBRegressor
import random
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


# Data import

In [None]:
# Read the train and test splits from CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


# EDA

In [None]:
# Print column dtypes, non-null counts and the memory footprint of the test
# set; memory_usage="deep" also measures the contents of object columns
test.info(memory_usage="deep")

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training data
train.describe()

In [None]:
# Total number of missing cells per dataset, shown as (train, test)
tuple(frame.isna().sum().sum() for frame in (train, test))

In [None]:
# Pie chart comparing the number of rows in the train and test datasets
dataset_sizes = {"Train dataset": len(train), "Test dataset": len(test)}

fig, ax = plt.subplots(figsize=(7, 7))
fig.set_facecolor("white")
pie = ax.pie(
    list(dataset_sizes.values()),
    labels=list(dataset_sizes.keys()),
    colors=["yellow", "green"],
    autopct="%1.1f%%",
    textprops={"fontsize": 15},
)
ax.axis("equal")  # equal aspect ratio keeps the pie circular
ax.set_title("Dataset length comparison", fontsize=16)
plt.show()

In [None]:
# Histogram of the `target` column of the training set
fig, ax = plt.subplots(figsize=(16, 8))

bars = ax.hist(
    train["target"],
    bins=100,
    color="palevioletred",
    edgecolor="black",
)

# Titles and axis labels
ax.set_title("Target distribution", fontsize=20, pad=15)
ax.set_xlabel("Target value", fontsize=14, labelpad=10)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)

# Horizontal grid lines and a little padding around the bars
ax.grid(axis="y")
ax.margins(0.025, 0.12)

plt.show()

In [None]:
# Column names of the categorical (cat0..cat9) and numerical (cont0..cont13) features
cat_features = [f"cat{idx}" for idx in range(10)]
num_features = [f"cont{idx}" for idx in range(14)]

In [None]:
# Overlaid train/test histograms for every numerical feature, one subplot each.

# Combined dataframe containing numerical features only; it supplies a common
# value range per feature so both histograms use identical bin edges
df = pd.concat([train[num_features], test[num_features]], axis=0)
columns = df.columns.values

# Grid layout: ceil(len(columns) / cols) rows, so no completely empty row is
# allocated when the feature count is an exact multiple of `cols`
cols = 3
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even for a single-row grid
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16, 20),
                        sharex=False, squeeze=False)

# Adding some distance between plots
plt.subplots_adjust(hspace=0.3)

# Per-feature limits, computed once instead of on every hist call
col_min = df.min()
col_max = df.max()

# (data, bar colour, legend label) for each dataset drawn on every subplot
datasets = (
    (train, "deepskyblue", "Train Dataset"),
    (test, "palevioletred", "Test Dataset"),
)

# axs.flat walks the grid row by row, matching the order of `columns`
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # No feature left for this slot: hide the axes for a clean background
        ax.set_visible(False)
        continue

    name = columns[i]
    value_range = (col_min[name], col_max[name])
    for data, color, label in datasets:
        ax.hist(data[name].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)
    ax.set_title(name, fontsize=14, pad=5)
    ax.tick_params(axis="y", labelsize=13)
    ax.tick_params(axis="x", labelsize=13)
    ax.grid(axis="y")
    ax.legend(fontsize=13)

plt.show()

In [None]:
# Check if the datasets have a different amount of categories in categorical
# features: grouped bars show the number of unique values per feature.
bars_pos = np.arange(len(cat_features))

width = 0.3
fig, ax = plt.subplots(figsize=(14, 6))
# Two bar series per feature: train sits left of the tick position, test right
# of it. The labels feed the legend so the two series can be told apart.
bars1 = ax.bar(bars_pos - width / 2,
               train[cat_features].nunique().values,
               width=width,
               color="darkorange", edgecolor="black",
               label="Train Dataset")
bars2 = ax.bar(bars_pos + width / 2,
               test[cat_features].nunique().values,
               width=width,
               color="steelblue", edgecolor="black",
               label="Test Dataset")
ax.set_title("Amount of values in categorical features", fontsize=20, pad=15)
ax.set_xlabel("Categorical feature", fontsize=15, labelpad=15)
ax.set_ylabel("Amount of values", fontsize=15, labelpad=15)
ax.set_xticks(bars_pos)
ax.set_xticklabels(cat_features, fontsize=12)
ax.tick_params(axis="y", labelsize=12)
ax.grid(axis="y")
ax.legend(fontsize=12)
ax.margins(0.01, 0.05)
plt.show()

In [None]:
# Correlation map
# Training data without the `id` column (drop returns a new frame, so `train`
# itself is left untouched)
features = train.drop("id", axis=1)

# Ordinal-encode all categorical features with a single encoder fit; each
# column is still encoded independently, so the codes equal a per-column fit.
# assign() replaces the columns in place of the originals, keeping column order.
encoder = OrdinalEncoder()
encoded_values = encoder.fit_transform(features[cat_features])
encoded = features.assign(**dict(zip(cat_features, encoded_values.T)))

# Calculating correlation values, rounded to keep the cell annotations short
df = encoded.corr().round(2)

# Boolean mask hiding the upper-right triangle (diagonal included), which only
# duplicates the lower-left part of the symmetric matrix
mask = np.triu(np.ones(df.shape, dtype=bool))

# Making a plot
plt.figure(figsize=(14, 14))
ax = sns.heatmap(df, annot=True, mask=mask, cmap="RdBu",
                 annot_kws={"weight": "normal", "fontsize": 9})
ax.set_title("Feature correlation heatmap", fontsize=17)
plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
         rotation_mode="anchor", weight="normal")
plt.setp(ax.get_yticklabels(), weight="normal",
         rotation_mode="anchor", rotation=0, ha="right")
plt.show()