In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Load the pizza data (Promote, Sales) and preview the first five rows.
pizza = pd.read_csv("pizza.csv")
pizza.head(n=5)

Unnamed: 0,Promote,Sales
0,23,554
1,56,1339
2,34,815
3,25,609
4,67,1600


In [3]:
# Unfitted least-squares estimator; later cells fit it on the expanded features.
lr = LinearRegression()

In [4]:
# Feature matrix as a one-column DataFrame (2-D) and the target as a Series.
x = pizza.loc[:, ['Promote']]
y = pizza.loc[:, 'Sales']

In [5]:
# Quadratic expansion of the single predictor; the printed names list the
# generated columns.
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit(x).transform(x)
print(poly.get_feature_names_out())

['1' 'Promote' 'Promote^2']


In [6]:
# Fit on the expanded features, then show the intercept followed by the
# per-column coefficients (one line each).
lr.fit(x_poly, y)
print(lr.intercept_, lr.coef_, sep="\n")

-18.27491741665085
[ 0.00000000e+00  2.48006959e+01 -1.30947333e-02]


In [7]:
# Predictions for the same x_poly rows the model was fitted on (in-sample).
y_pred = lr.predict(x_poly)

In [8]:
# Coefficient of determination of the predictions against the observed target.
print(r2_score(y_true=y, y_pred=y_pred))

0.9889789184026645


In [9]:
# Load the Boston housing data and preview the first five rows.
bos = pd.read_csv("Boston.csv")
bos.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [10]:
# Degree-3 polynomial regression of medv on every column except the last one.
# NOTE: the model is scored on the same rows it was fitted on, so the printed
# R^2 is an in-sample figure.
x = bos[bos.columns[:-1]]
y = bos.loc[:, 'medv']
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit(x).transform(x)
# print(poly.get_feature_names_out())   # uncomment to list the generated columns
# print(lr.intercept_, lr.coef_, sep="\n")  # uncomment (after the fit) to inspect parameters
y_pred = lr.fit(x_poly, y).predict(x_poly)
print(r2_score(y_true=y, y_pred=y_pred))

0.9979685756335139


In [11]:
# Train Test Split
# Linear Regression

In [12]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=2022,
)

In [25]:
# Baseline: plain linear regression fitted on the training split and scored
# on the held-out test split.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.7430693569118816


In [26]:
# Polynomial

In [33]:
# Build the polynomial feature matrices for both splits.
# The transformer is fitted once, on the training split only (fit_transform
# already performs the fit, so a separate poly.fit call is redundant), and the
# fitted expansion is then applied unchanged to the test split.
# NOTE: with degree=1 the expansion only prepends a constant column to the
# original features; raise `degree` to add higher-order terms.
poly = PolynomialFeatures(degree=1)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

In [34]:
# Linear regression on the expanded training features, scored on the expanded
# test features.
lr = LinearRegression().fit(x_train_poly, y_train)
y_pred = lr.predict(x_test_poly)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.7430693569118818


# Pipeline

In [36]:
from sklearn.pipeline import Pipeline

In [37]:
# Same expansion + regression as the manual version, chained in one Pipeline so
# a single fit/predict call drives both steps.
poly = PolynomialFeatures(degree=1)
lr = LinearRegression()
pipe = Pipeline(
    steps=[
        ('Polynomial', poly),
        ('LIN', lr),
    ]
)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.7430693569118818


In [39]:
# Read spreadsheet columns A-F, skipping the first two rows of the sheet, and
# preview the first five records.
wed = pd.read_excel("Weddings.xlsx", skiprows=2, usecols='A:F')
wed.head(n=5)

Unnamed: 0,Couple's Income,Bride's age,Payor,Wedding cost,Attendance,Value Rating
0,130000,22,Bride's Parents,60700,300,3
1,157000,23,Bride's Parents,52000,350,1
2,98000,27,Bride & Groom,47000,150,3
3,72000,29,Bride & Groom,42000,200,5
4,86000,25,Bride's Parents,34000,250,3


In [50]:
# case 1: Attendance regressed on Wedding cost.
#
# A 70/30 train_test_split variant of this fit (random_state=2022) was tried
# and left disabled; the R^2 printed below is in-sample, i.e. scored on the
# same rows the model was fitted on.
x = wed.loc[:, ["Wedding cost"]]
y = wed.loc[:, 'Attendance']
lr = LinearRegression().fit(x, y)
# print(lr.intercept_, lr.coef_, sep="\n")  # uncomment to inspect the fitted line
y_pred = lr.predict(x)
print(r2_score(y_true=y, y_pred=y_pred))

0.5377184992361959


In [51]:
# case 1 (alternative predictor): Attendance regressed on Value Rating.
# The printed R^2 is in-sample (scored on the rows used for fitting).
x = wed.loc[:, ["Value Rating"]]
y = wed.loc[:, 'Attendance']
lr = LinearRegression().fit(x, y)
# print(lr.intercept_, lr.coef_, sep="\n")  # uncomment to inspect the fitted line
y_pred = lr.predict(x)
print(r2_score(y_true=y, y_pred=y_pred))

0.006579643742817498


In [52]:
# case 2: Attendance regressed on Couple's Income.
# The printed R^2 is in-sample (scored on the rows used for fitting).
x = wed.loc[:, ["Couple's Income"]]
y = wed.loc[:, 'Attendance']
lr = LinearRegression().fit(x, y)
# print(lr.intercept_, lr.coef_, sep="\n")  # uncomment to inspect the fitted line
y_pred = lr.predict(x)
print(r2_score(y_true=y, y_pred=y_pred))

0.4897348449194999


In [55]:
# case 3: Wedding cost regressed on Couple's Income.
# The printed R^2 is in-sample (scored on the rows used for fitting).
x = wed.loc[:, ["Couple's Income"]]
y = wed.loc[:, 'Wedding cost']
lr = LinearRegression().fit(x, y)
# print(lr.intercept_, lr.coef_, sep="\n")  # uncomment to inspect the fitted line
y_pred = lr.predict(x)
print(r2_score(y_true=y, y_pred=y_pred))

0.6887947810328077


## ANOVA (OLS regression summaries with statsmodels)

In [63]:
import statsmodels.api as sm

In [64]:
# Re-read the pizza data for the statsmodels section and preview five rows.
pizza = pd.read_csv("pizza.csv")
pizza.head(n=5)

Unnamed: 0,Promote,Sales
0,23,554
1,56,1339
2,34,815
3,25,609
4,67,1600


In [65]:
# Simple OLS of Sales on Promote with statsmodels. add_constant appends the
# `const` column so the model estimates an intercept alongside the slope.
y = pizza['Sales']
x = sm.add_constant(pizza['Promote'])
model = sm.OLS(endog=y, exog=x)
results = model.fit()
# Fitted parameters first, then the full regression report.
print(results.params)
print(results.summary())

const       5.485865
Promote    23.506403
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.989
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     1503.
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           4.97e-18
Time:                        11:45:18   Log-Likelihood:                -105.50
No. Observations:                  19   AIC:                             215.0
Df Residuals:                      17   BIC:                             216.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------



In [57]:
# Read spreadsheet columns A-F, skipping the first two rows of the sheet, and
# preview the first five records.
credit = pd.read_excel("CreditApprovalDecisions.xlsx", skiprows=2, usecols='A:F')
credit.head(n=5)

Unnamed: 0,Homeowner,Credit Score,Years of Credit History,Revolving Balance,Revolving Utilization,Decision
0,Y,725,20,11320,0.25,Approve
1,Y,573,9,7200,0.7,Reject
2,Y,677,11,20000,0.55,Approve
3,N,625,15,12800,0.65,Reject
4,N,527,12,5700,0.75,Reject


In [58]:
# Dimensions of the loaded frame as (rows, columns).
credit.shape

(50, 6)

In [66]:
# Multiple OLS of Credit Score on three numeric predictors. add_constant
# appends the `const` column so the model estimates an intercept.
predictor_columns = [
    'Years of Credit History',
    'Revolving Balance',
    'Revolving Utilization',
]
y = credit['Credit Score']
x = sm.add_constant(credit[predictor_columns])
model = sm.OLS(endog=y, exog=x)
results = model.fit()
# Fitted parameters first, then the full regression report.
print(results.params)
print(results.summary())

const                      771.892254
Years of Credit History     -2.098481
Revolving Balance            0.001565
Revolving Utilization     -246.227645
dtype: float64
                            OLS Regression Results                            
Dep. Variable:           Credit Score   R-squared:                       0.665
Model:                            OLS   Adj. R-squared:                  0.643
Method:                 Least Squares   F-statistic:                     30.39
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           5.53e-11
Time:                        11:45:46   Log-Likelihood:                -268.13
No. Observations:                  50   AIC:                             544.3
Df Residuals:                      46   BIC:                             551.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std e