In [2]:
#importing libraries (all notebook imports live in this cell)
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [43]:
#loading the spending-score CSV into a data frame and previewing the first five rows
df = pd.read_csv(filepath_or_buffer='spendingscores.csv')
df.head(n=5)

Unnamed: 0,Gender,Age,AnnualIncome,SpendingScore
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [4]:
#building and fitting the baseline OLS model (Gender, Age and AnnualIncome as predictors)
# Check the columns by name instead of overwriting df.columns positionally:
# a positional overwrite would silently mislabel the data if the CSV's column
# order or count ever changed, and it mutates a frame that was already displayed.
required_columns = ['Gender', 'Age', 'AnnualIncome', 'SpendingScore']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f'data frame is missing expected columns: {missing_columns}')

# Gender is non-numeric, so the formula interface dummy-codes it
# (it appears as Gender[T.Male] in the fitted model's terms).
model = smf.ols(formula='SpendingScore ~ Gender + Age + AnnualIncome', data=df).fit()

In [5]:
#displaying the regression summary table (95% confidence intervals, i.e. alpha=0.05)
model.summary(alpha=0.05)

0,1,2,3
Dep. Variable:,SpendingScore,R-squared:,0.109
Model:,OLS,Adj. R-squared:,0.095
Method:,Least Squares,F-statistic:,7.96
Date:,"Wed, 18 Jan 2023",Prob (F-statistic):,4.91e-05
Time:,19:24:27,Log-Likelihood:,-922.05
No. Observations:,200,AIC:,1852.0
Df Residuals:,196,BIC:,1865.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,73.9300,6.642,11.130,0.000,60.831,87.029
Gender[T.Male],-2.0132,3.512,-0.573,0.567,-8.939,4.913
Age,-0.6004,0.125,-4.806,0.000,-0.847,-0.354
AnnualIncome,0.0079,0.066,0.119,0.905,-0.123,0.139

0,1,2,3
Omnibus:,11.059,Durbin-Watson:,3.448
Prob(Omnibus):,0.004,Jarque-Bera (JB):,6.033
Skew:,-0.233,Prob(JB):,0.049
Kurtosis:,2.288,Cond. No.,291.0


In [6]:
#showing p-values
# One entry per model term (intercept and each predictor); these are the
# same quantities reported in the P>|t| column of the summary table.
model.pvalues

Intercept         1.287752e-22
Gender[T.Male]    5.671172e-01
Age               3.055177e-06
AnnualIncome      9.050939e-01
dtype: float64

In [7]:
#printing the fitted coefficients, one per model term
coefficients = model.params
print('Model coefficients are: ', coefficients)

Model coefficients are:  Intercept         73.930034
Gender[T.Male]    -2.013234
Age               -0.600371
AnnualIncome       0.007929
dtype: float64


In [8]:
#showing standard errors of the coefficient estimates
# model.bse holds the standard errors of the estimated parameters (the
# "std err" column of the summary table), not standard deviations of the
# data, so the printed label says "standard errors".
print('Model standard errors are:', model.bse)

Model stds are: Intercept         6.642253
Gender[T.Male]    3.511825
Age               0.124916
AnnualIncome      0.066420
dtype: float64


In [9]:
#showing pairwise scatter plots of all columns (a visual check of relationships,
#not a numeric correlation matrix)
fig = px.scatter_matrix(df)
fig

In [44]:
#building a reduced data frame without the Gender column
# NOTE(review): in the fitted model above, AnnualIncome has the larger p-value
# (about 0.905 versus about 0.567 for Gender), and the figure above is a
# scatter matrix rather than a correlation matrix - confirm that Gender is
# really the predictor that was meant to be removed.
df2 = df.drop(labels='Gender', axis='columns')
df2

Unnamed: 0,Age,AnnualIncome,SpendingScore
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40
...,...,...,...
195,35,120,79
196,45,126,28
197,32,126,74
198,32,137,18


In [45]:
#refitting OLS on the reduced frame (Age and AnnualIncome only)
# `model` is rebound here, so it refers to the reduced model from this point on.
reduced_ols = smf.ols(formula='SpendingScore ~ Age + AnnualIncome', data=df2)
model = reduced_ols.fit()


In [46]:
#regression summary for the reduced model (fitted without Gender), 95% confidence intervals
model.summary(alpha=0.05)

0,1,2,3
Dep. Variable:,SpendingScore,R-squared:,0.107
Model:,OLS,Adj. R-squared:,0.098
Method:,Least Squares,F-statistic:,11.82
Date:,"Wed, 18 Jan 2023",Prob (F-statistic):,1.42e-05
Time:,19:36:43,Log-Likelihood:,-922.21
No. Observations:,200,AIC:,1850.0
Df Residuals:,197,BIC:,1860.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,73.3479,6.553,11.193,0.000,60.425,86.271
Age,-0.6048,0.124,-4.859,0.000,-0.850,-0.359
AnnualIncome,0.0057,0.066,0.087,0.931,-0.125,0.136

0,1,2,3
Omnibus:,10.159,Durbin-Watson:,3.452
Prob(Omnibus):,0.006,Jarque-Bera (JB):,6.026
Skew:,-0.254,Prob(JB):,0.0491
Kurtosis:,2.318,Cond. No.,286.0
