In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

In [3]:
df1 = pd.read_csv("Admission_Predict_Ver1.1.csv").iloc[:,1:]
df2 = pd.read_csv("Admission_Predict.csv").iloc[:,1:]
df = pd.concat([df1, df2]).drop_duplicates()
df = df.rename(columns={"LOR " : "LOR", "Chance of Admit ": "Chance of Admit"})
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1,0.87
496,337,117,5,5.0,5.0,9.87,1,0.96
497,330,120,5,4.5,5.0,9.56,1,0.93
498,312,103,4,4.0,5.0,8.43,0,0.73


In [11]:
def build_pipeline(df):
    categorical_features = ["Research"]
    numeric_features = [col for col in df.columns if col not in categorical_features + ["Chance of Admit"]]

    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

def build_model(model_obj, df):
    preprocessor = build_pipeline(df)
    model = Pipeline(steps=[("preprocessor", preprocessor), ("model", model_obj)])
    return model


In [32]:
models = {
    
    "DecisionTreeRegressor": DecisionTreeRegressor(), 
    "LinearRegression": LinearRegression(), "RandomForestRegressor": RandomForestRegressor(), 
    "GradientBoostingRegressor": GradientBoostingRegressor(), "SVR": SVR(), "MLPRegressor": MLPRegressor(),
    }

In [20]:
# X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.2, random_state=42)
p = 0.8


df = df.sample(frac=1).reset_index(drop=True)

X_train, X_test, y_train, y_test = df.iloc[:int(len(df)*p),:-1], df.iloc[int(len(df)*p):,:-1], df.iloc[:int(len(df)*p),-1], df.iloc[int(len(df)*p):,-1]



In [33]:
for m in models:
    model = build_model(models[m], df)
    model.fit(X_train, y_train)
    print(m, model.score(X_test, y_test))

DecisionTreeRegressor 0.6239391796322489
LinearRegression 0.8603946309332895
RandomForestRegressor 0.8333809347006134
GradientBoostingRegressor 0.8190572521077175
SVR 0.6670989566032179
MLPRegressor -1.1255116570246368
