# In [None]:
import csv
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Path of the CSV dataset loaded with pd.read_csv further down.
file_name = "full-data.csv"

# Counter initialised to zero; not referenced anywhere in the visible code.
ite = 0

# Single shared estimator; fitdata() refits it on every call.
lr = LinearRegression()

# Plots the scatter plot and regression line
def plotgraph(X, y, name, pred):
    """Draw the data and the fitted line, and save it to ./graphs/<Name>.png.

    The figure is closed after it has been written to disk.

    Args:
        X: Feature values. A single feature (1-D, or 2-D with one column) is
            drawn on the x-axis directly. With several columns there is no
            single x-axis, so actual values are plotted against predictions
            instead.
        y: Observed target values, drawn on the y-axis.
        name: Label of the feature(s). It is capitalised and used for the
            x-axis label, the title and the output file name.
        pred: Model predictions, one per row of X.
    """
    name = name.capitalize()
    title = "Fig 1.3 Linear Regression for " + name + " vs Sleep"

    # matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
    # 3.8 removed the old name, so pick whichever this installation provides.
    style = next(
        (s for s in ("seaborn-v0_8", "seaborn") if s in plt.style.available),
        "default",
    )
    # Apply the style before the figure exists; applying it afterwards left
    # the first figure styled differently from all the following ones.
    plt.style.use(style)

    fig = plt.figure()
    try:
        multi_feature = getattr(X, "ndim", 1) > 1 and X.shape[1] > 1
        if multi_feature:
            # scatter() needs x and y of equal size, which a multi-column X
            # cannot satisfy, so show actual vs. predicted values instead.
            plt.scatter(pred, y, color='#495867', label="Actual")
            lo, hi = min(pred), max(pred)
            plt.plot([lo, hi], [lo, hi], color='#FE5F55',
                     label="Predicted = actual")
            plt.xlabel("Predicted sleep (" + name + ")")
        else:
            plt.scatter(X, y, color='#495867', label="Actual")
            plt.plot(X, pred, color='#FE5F55', label="Best fit")
            plt.xlabel(name)

        # plot formatting
        plt.grid(False)
        plt.title(title)
        plt.ylabel('Sleep')
        plt.legend()

        # savefig does not create missing directories.
        os.makedirs("./graphs", exist_ok=True)
        plt.savefig("./graphs/" + name + ".png", dpi=300, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

# Fits the data to the model and returns r2 score and calls plotgraph
def fitdata(X, y, name):
    """Fit the shared linear model to (X, y), plot the fit and summarise it.

    The module-level estimator ``lr`` is refitted in place, so after the call
    it holds the parameters of this fit.

    Args:
        X: 2-D feature matrix with one row per sample and one column per
            feature.
        y: Target values, one per row of X.
        name: Label of the feature(s), forwarded to plotgraph.

    Returns:
        A list ``[intercept, coefficient, r2]``. ``coefficient`` is a scalar
        when X has a single feature. For several features it is the list of
        all coefficients (previously only the first one was returned and the
        rest were silently dropped).
    """
    lr.fit(X, y)
    intercept = lr.intercept_
    coefs = lr.coef_
    slope = coefs[0] if len(coefs) == 1 else coefs.tolist()

    # Predict once and reuse the result for both the score and the plot.
    pred = lr.predict(X)
    r2 = r2_score(y, pred)
    plotgraph(X, y, name, pred)
    return [intercept, slope, r2]


df = pd.read_csv(file_name)
cat_variables = ["gender", "ethnicity", "major"]

# Check for null values BEFORE encoding the categorical columns: `.cat.codes`
# turns missing values into -1, which would hide them from this check.
if df.isnull().values.any():
    raise ValueError("Data contains null values. Can't proceed with linear regression")

# prepare categorical variables for regression by replacing each category
# with its integer code
for st in cat_variables:
    df[st] = df[st].astype("category").cat.codes

# separate the independent variables (X) and the dependent variable (y)
X = df.drop(columns=["sleep"])
y = df["sleep"]

# writes all the results to a csv file
# stores the r2 score, gradient and intercept for each independent variable
# newline='' is required by the csv module; without it every row is followed
# by a blank line on Windows.
with open('readings.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Column name", "Intercept", "Coefficients", "R2"])
    for column in X.columns:
        # fit each feature on its own; reshape gives the (samples, 1) matrix
        # that the estimator expects
        row = [column] + fitdata(X[column].values.reshape(-1, 1), y, column)
        writer.writerow(row)

# fit on all features together; only the plot is kept, the returned values
# are not written to the csv file
fitdata(X, y, "All")
