#
IMPORT STATEMENTS

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#
DATA READING AND ANALYSIS

In [None]:
# Load the admissions dataset into a DataFrame. The path is relative, so it is
# resolved against the working directory the kernel was started in.
data = pd.read_csv("dataset/Admission_Predict.csv")

In [None]:
# Remove the "Serial No." column (presumably a plain row counter with no
# predictive value -- confirm against the dataset description).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
data = data.drop(columns=["Serial No."], errors="ignore")

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Print the column dtypes and non-null counts of the DataFrame.
data.info()

In [None]:
# Count missing values per column (isna is the same method as isnull).
data.isna().sum()

#
VISUALIZATION

In [None]:
# Scatter of CGPA against GRE score, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(data['GRE Score'], data['CGPA'])
ax.set_title('CGPA vs GRE Score')
ax.set_xlabel('GRE Score')
ax.set_ylabel('CGPA')
plt.show()

In [None]:
# Scatter of the SOP column against CGPA, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(data['CGPA'], data['SOP'])
ax.set_title('SOP for CGPA')
ax.set_xlabel('CGPA')
ax.set_ylabel('SOP')
plt.show()

In [None]:
# GRE vs TOEFL scores, restricted to rows whose CGPA is at least 8.5.
high_cgpa = data[data["CGPA"] >= 8.5]
ax = high_cgpa.plot.scatter(x='GRE Score', y='TOEFL Score', color="BLUE")

# Relabel the axes pandas created and switch the grid on.
ax.set_xlabel("GRE Score")
ax.set_ylabel("TOEFL SCORE")
ax.set_title("CGPA>=8.5")
ax.grid(True)

plt.show()

In [None]:
# Histogram of GRE scores on a 6x6 inch figure.
# NOTE(review): 200 bins is kept from the original; it may be far more than the
# number of distinct scores, which would leave most bins empty -- confirm.
gre_scores = data["GRE Score"]
ax = gre_scores.plot.hist(bins=200, figsize=(6, 6))

ax.set_title("GRE Scores")
ax.set_xlabel("GRE Score")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
# Bar chart of the minimum, mean and maximum TOEFL score.
toefl = data["TOEFL Score"]
levels = ["Worst", "Average", "Best"]
scores = np.array([toefl.min(), toefl.mean(), toefl.max()])

# plt.bar takes (x, height): the categorical level goes on the x axis and the
# score is the bar height. The original call had the two swapped, which
# contradicted the axis labels below.
plt.bar(levels, scores)

plt.title("TOEFL Scores")
plt.xlabel("Level")
plt.ylabel("TOEFL Score")

plt.show()

In [None]:
# Bar chart of the minimum, mean and maximum GRE score.
gre = data["GRE Score"]
levels = ["Worst", "Average", "Best"]
scores = np.array([gre.min(), gre.mean(), gre.max()])

# plt.bar takes (x, height): the categorical level goes on the x axis and the
# score is the bar height. The original call had the two swapped, which
# contradicted the axis labels below.
plt.bar(levels, scores)

plt.title("GRE Scores")
plt.xlabel("Level")
plt.ylabel("GRE Score")

plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(10, 10))

corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidths=0.05, cmap="magma", ax=ax)

plt.show()

In [None]:
# Frequency of each value in the Research column. The result must be printed
# explicitly: a notebook only auto-displays the LAST expression of a cell, so
# the original bare expression was silently discarded.
print(data["Research"].value_counts())

# Number of rows per university rating.
sns.countplot(x="University Rating", data=data)
plt.show()

In [None]:
# Admission chance aggregated per university rating (seaborn's default estimator).
# The target column name carries a trailing space, matching how it is
# referenced in the rest of this notebook.
sns.barplot(data=data, x="University Rating", y="Chance of Admit ")

#
TRAIN-TEST SPLIT

In [None]:
# Target: the admission-chance column (its name includes a trailing space).
y = data['Chance of Admit ']
# Features: every remaining column.
X = data.drop(columns=['Chance of Admit '])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every metric reported below -- reproducible across
# kernel restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=1
)

#
MODELING AND TRAINING

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor with a fixed seed, fitted on the training split.
rgr = RandomForestRegressor(random_state=1)
rgr.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted regressor on the held-out split (for scikit-learn
# regressors, score() returns the coefficient of determination R^2).
rgr.score(X_test,y_test)

In [None]:
# Predict on the held-out features and display the raw prediction array.
y_predict=rgr.predict(X_test)
y_predict

In [None]:
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import numpy as np

# Error metrics for the regressor on the held-out split. The MSE is computed
# once and reused for the RMSE.
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

In [None]:
# Turn the continuous targets into boolean labels: True when the value is
# strictly greater than 0.5.
# NOTE(review): this overwrites the continuous y_train / y_test, so re-running
# the regression cells above after this one would use boolean targets.
y_train = y_train.gt(0.5)
y_test = y_test.gt(0.5)

In [None]:
# Import from the public package path: `sklearn.linear_model._logistic` is a
# private module whose location may change between scikit-learn releases.
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed seed and a raised iteration cap.
lore = LogisticRegression(random_state=0, max_iter=1000)

# fit() returns the estimator itself, so `lr` and `lore` refer to the same
# fitted model; `lr` is the name used by the later cells.
lr = lore.fit(X_train, y_train)

In [None]:
# Predicted class labels from the logistic-regression model for the held-out features.
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix

# ROC AUC measures how well the model RANKS samples, so it needs a continuous
# score. Use the predicted probability of the positive class (second column of
# predict_proba, since classes_ is sorted as [False, True]) rather than the
# thresholded labels, which collapse the ROC curve to a single operating point.
y_score = lr.predict_proba(X_test)[:, 1]

print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('ROC AUC Score:', roc_auc_score(y_test, y_score))
print('Confussion Matrix:\n', confusion_matrix(y_test, y_pred))


#
SAVING THE MODELS

In [None]:
import pickle

In [None]:
# Persist both fitted models. The `with` blocks guarantee each file is flushed
# and closed even if pickling fails; the original left the handles returned by
# open() unclosed.
with open("university.pkl", "wb") as model_file:
    pickle.dump(lr, model_file)  # logistic regression model

with open("university_percent.pkl", "wb") as model_file:
    pickle.dump(rgr, model_file)  # random forest regression model