In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor  
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [86]:
# Load the fee dataset from a CSV resolved relative to the working directory,
# then summarise it; the trailing expression is what the cell displays.
df = pd.read_csv("feeData.csv")
df.describe()

Unnamed: 0,Year,Value
count,3548.0,3548.0
mean,2016.923337,13027.720124
std,2.55391,8734.568645
min,2013.0,1225.0
25%,2015.0,7756.75
50%,2017.0,10203.5
75%,2019.0,14830.75
max,2021.0,49152.0


In [87]:
# Show the column labels of the loaded frame.
df.columns

Index(['Year', 'State', 'Type', 'Length', 'Expense', 'Value'], dtype='object')

In [88]:
# Drop every row that contains at least one missing value. The surviving rows
# keep their original index labels, so the index can have gaps after this step.
# Rebinding `df` means the unfiltered frame is no longer reachable by name.
df = df.dropna()

In [89]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Year,State,Type,Length,Expense,Value
0,2013,Alabama,Private,4-year,Fees/Tuition,13983
1,2013,Alabama,Private,4-year,Room/Board,8503
2,2013,Alabama,Public In-State,2-year,Fees/Tuition,4048
3,2013,Alabama,Public In-State,4-year,Fees/Tuition,8073
4,2013,Alabama,Public In-State,4-year,Room/Board,8473


In [90]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot(df):
    """Scatter every input feature against ``Value`` in a single row of panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'State', 'Type', 'Length',
        'Expense' and 'Value'.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Extract input and target features
    X = df[['Year', 'State', 'Type', 'Length', 'Expense']]
    y = df['Value']

    # One-hot encode all four categorical columns (not only State and Type) so
    # that no raw string column is handed to the plotting call. dtype=float
    # gives 0.0/1.0 indicators instead of booleans.
    X = pd.get_dummies(
        X,
        columns=['State', 'Type', 'Length', 'Expense'],
        dtype=float,
    )

    # One panel per encoded feature, each with Value on the y-axis
    sns.pairplot(
        pd.concat([X, y], axis=1),
        x_vars=list(X.columns),
        y_vars=['Value'],
    )

    plt.show()


In [91]:
# Target is the Value column; every other column is an input feature.
y = df["Value"]
X = df.drop("Value", axis="columns")

In [92]:
# Summary statistics of the feature frame.
X.describe()



Unnamed: 0,Year
count,3548.0
mean,2016.923337
std,2.55391
min,2013.0
25%,2015.0
50%,2017.0
75%,2019.0
max,2021.0


In [93]:
# Keep only the object-dtype feature columns; these are the ones that
# get one-hot encoded. The cell displays their labels.
categorical = X.select_dtypes(include="object")
categorical.columns

Index(['State', 'Type', 'Length', 'Expense'], dtype='object')

In [94]:
# One-hot encode the categorical feature columns.
oneHencoder = OneHotEncoder()

# fit_transform returns a sparse matrix; densify it before wrapping in a frame.
X_en = oneHencoder.fit_transform(categorical)

# Carry over the row labels of `categorical`. Without index=..., the new frame
# would get a fresh 0..n-1 index, and a later column-wise pd.concat with the
# numeric columns (which aligns on index) would pair the wrong rows and pad
# with NaN whenever dropna() has removed rows.
X_en = pd.DataFrame(X_en.toarray(), index=categorical.index)

# Column labels are the positional integers 0..k-1 converted to strings, so all
# feature names in the final frame share one type.
X_en.columns = X_en.columns.astype(str)
X_en.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [95]:
# Take the columns that are already numeric and append them to the right of
# the one-hot block; the cell then reports the resulting (rows, columns).
X_num = X.select_dtypes(include="number")
X = pd.concat((X_en, X_num), axis="columns")
X.shape


(3548, 59)

In [96]:
# Summary statistics of the target column.
y.describe()

count     3548.000000
mean     13027.720124
std       8734.568645
min       1225.000000
25%       7756.750000
50%      10203.500000
75%      14830.750000
max      49152.000000
Name: Value, dtype: float64

In [97]:
#plot(df)

In [98]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible. The cell then summarises the training target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
y_train.describe()

count     2483.000000
mean     13057.833266
std       8752.470680
min       1225.000000
25%       7739.000000
50%      10286.000000
75%      14960.000000
max      49152.000000
Name: Value, dtype: float64

In [99]:
# Linear regression estimator constructed with scikit-learn's default settings.
model = LinearRegression()

In [100]:
# Show the feature column labels of the training split.
X_train.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49', '50', '51', '52', '53', '54', '55', '56', '57', 'Year'],
      dtype='object')

In [101]:
# Fit on the training split. scikit-learn's fit() returns the estimator itself,
# so rebinding `model` keeps the same (now fitted) object.
model = model.fit(X_train, y_train)

In [102]:
def format_dollars(amount):
    """Format a number as a dollar string with two decimal places.

    A negative amount gets its minus sign in front of the dollar sign
    (``-$5.50``) rather than after it (``$-5.50``). Output for non-negative
    amounts is ``$`` followed by the value formatted with ``.2f``.

    Parameters
    ----------
    amount : int or float
        Value to format; must support ``<``, ``abs()`` and ``.2f`` formatting.

    Returns
    -------
    str
        The formatted amount, e.g. ``"$13983.00"``.
    """
    sign = "-" if amount < 0 else ""
    return f"{sign}${abs(amount):.2f}"

In [103]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the held-out split.
predictions = model.predict(X_test)

# MSE is in squared units of the target; its square root (RMSE) is in the same
# units as the target and is therefore easier to read.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

# The original names stay bound in case other cells read them. Note that
# `acc_score` holds the mean squared error, not an accuracy.
acc_score, val = mse, r2

# format_dollars assumes the target is a dollar amount.
print(f"MSE: {mse:.2f} | RMSE: {format_dollars(rmse)} | R^2: {r2:.4f}")


23649223.46948357 0.6869977014736848
