In [None]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw training features and targets from the data/ directory.
df = pd.read_csv('data/x_train.csv')
df2 = pd.read_csv('data/y_train.csv')

# Work on copies so the frames read from disk stay untouched.
x_train, y_train = df.copy(), df2.copy()

# Feature filter loaded from JSON; its "title" column is used further down
# to select columns of x_train.
filter1 = pd.read_json('filter.json')

In [None]:
# Restrict x_train to the features named in the "title" column of the filter.
filter1_titles = filter1["title"].tolist()

# Indexing a DataFrame with a list of labels raises KeyError as soon as one
# label is not a column, so only the titles that exist in x_train are selected.
# NOTE(review): the original one-line selection was marked "doesn't work";
# titles missing from x_train are the presumed cause -- confirm against the data.
missing_titles = [title for title in filter1_titles if title not in x_train.columns]
if missing_titles:
    # Surface the skipped titles instead of dropping them silently.
    print("Titles from the filter not found in x_train: " + str(missing_titles))

x_train_f1 = x_train[[title for title in filter1_titles if title in x_train.columns]]
x_train_f1.head()

## Correlation

In [None]:
# Heatmap of the pairwise correlations between the filtered features,
# drawn on a 10x10 inch figure.
fig = plt.figure(figsize=(10, 10))
feature_corr = x_train_f1.corr()
# NOTE(review): `fmt` only formats cell annotations, and `annot` is not set
# here, so no numbers are drawn -- confirm whether annotations were intended.
sns.heatmap(feature_corr, fmt='.2f')

# The 2nd filter builds a set of features that are not correlated with each other

In [None]:
# Scatter every filtered feature against the target, one subplot per feature.
features = x_train_f1.columns
n_features = len(features)

# Size the grid so that it holds ALL features: a near-square layout with
# ceil(sqrt(n)) columns and as many rows as needed. Flooring the square root
# would silently skip features whenever n is not a perfect square.
# max(1, ...) keeps the grid valid even when there are no features at all.
cols = max(1, int(np.ceil(n_features ** 0.5)))
rows = max(1, -(-n_features // cols))  # ceiling division

# squeeze=False guarantees a 2-D array of axes, even for a 1x1 or 1xN grid.
fig, axes = plt.subplots(rows, cols, figsize=(10, 10), tight_layout=True, squeeze=False)

# axes.flat walks the grid row by row, matching the column order of x_train_f1.
for idx, ax in enumerate(axes.flat):
    if idx < n_features:
        ax.scatter(x_train_f1.iloc[:, idx], y_train)
        ax.set_title(features[idx])
    else:
        # Hide the unused cells at the end of the grid.
        ax.set_axis_off()

In [None]:
# Histogram of the first filtered feature on a 5x5 inch figure.
fig = plt.figure(figsize=(5, 5))
hist_ax = sns.histplot(x_train_f1.iloc[:, 0])
# Clip the view to x in [0, 1000] and counts in [0, 20000].
hist_ax.set_xlim(0, 1000)
hist_ax.set_ylim(0, 20000)

## Train the model

In [None]:
from implementations import least_squares, ridge_regression

# Every row holds at least one NaN, so dropping incomplete rows (dropna) would
# leave an empty dataset. Each missing value is replaced by the mean of its
# column instead.
column_means = x_train_f1.mean()
x_train_c = x_train_f1.fillna(column_means)
print(x_train_c.values)

# Fit the model on the imputed features; the result is unpacked as
# (weights, mse).
weights, mse = least_squares(y_train.values, x_train_c.values)
print(f"Weights: {weights!s}\nMSE: {mse!s}")

## Make predictions

In [None]:
# Apply the fitted weights to the (imputed) training features to get raw
# prediction scores.
y_pred = x_train_c.values.dot(weights)
print("y_pred: " + str(y_pred))

# Min-max scale the raw scores to the [0, 1] range.
y_pred_min = y_pred.min()
y_pred_span = y_pred.max() - y_pred_min
if y_pred_span == 0:
    # All scores are identical: dividing by the zero span would turn every
    # entry into NaN, and NaN passes neither threshold test below, so NaNs
    # would end up in the CSV. Treat the constant case as all zeros instead.
    y_pred_norm = np.zeros_like(y_pred, dtype=float)
else:
    y_pred_norm = (y_pred - y_pred_min) / y_pred_span

# Binarise: scaled scores above 0.5 become 1, everything else becomes 0.
# The order matters: the 1s written first are not <= 0.5, so the second
# assignment leaves them alone.
y_pred_norm[y_pred_norm > 0.5] = 1
y_pred_norm[y_pred_norm <= 0.5] = 0

# Store the labels in submission_file.csv, in a single "_MICHD" column and
# without the index.
submission_file = pd.DataFrame(y_pred_norm, columns=["_MICHD"])
submission_file.to_csv('submission_file.csv', index=False)



