In [38]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as px

In [28]:
df = pd.read_csv('./spendingscores.csv', sep=',')
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

In [8]:
train, test = train_test_split(df, test_size=0.2)

train_x = train.loc[:, df.columns != 'SpendingScore']
train_y = train.loc[:, df.columns == 'SpendingScore']

In [10]:
trained_model = sm.OLS(train_y, train_x).fit()

In [18]:
print('P values')
print(trained_model.pvalues)
print('------')
print('Standard devs')
print(np.sqrt(trained_model.mse_resid))
print('------')
print('Results with coefficients')
print(trained_model.params)

P values
Age             7.458593e-04
AnnualIncome    5.027585e-07
Gender_Male     2.581791e-01
dtype: float64
------
Standard devs
31.83869875406372
------
Results with coefficients
Age             0.428323
AnnualIncome    0.412473
Gender_Male     5.723968
dtype: float64


In [40]:
print('Correlation matrix')
fig = px.imshow(df.corr())
fig.show()

Correlation matrix


In [48]:
df_without_gender = df.drop('Gender_Male', axis=1)

In [49]:
train, test = train_test_split(df_without_gender, test_size=0.2)
train_x = train.loc[:, df_without_gender.columns != 'SpendingScore']
train_y = train.loc[:, df_without_gender.columns == 'SpendingScore']
trained_model_without_gender = sm.OLS(train_y, train_x).fit()

In [50]:
print('P values')
print(trained_model_without_gender.pvalues)
print('------')
print('Standard devs')
print(np.sqrt(trained_model_without_gender.mse_resid))
print('------')
print('Results with coefficients')
print(trained_model_without_gender.params)

P values
Age             3.137835e-04
AnnualIncome    5.698575e-09
dtype: float64
------
Standard devs
30.50409166283086
------
Results with coefficients
Age             0.428335
AnnualIncome    0.461432
dtype: float64


In [51]:
print('Correlation matrix')
fig = px.imshow(df_without_gender.corr())
fig.show()

Correlation matrix
