In [7]:
# Unpack the dataset archive, then list the destination folder as a sanity check
import zipfile
import os

zip_path = '/content/universityPrediction.zip'
extraction_path = '/content/data/universityPrediction'

# The context manager closes the archive handle once extraction has finished
with zipfile.ZipFile(file=zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extraction_path)

# Show which files ended up in the extraction folder
extracted_files = os.listdir(extraction_path)
print(extracted_files)


['original_data.csv', 'score.csv']


In [43]:
import pandas as pd

original_data_path = '/content/data/universityPrediction/original_data.csv'
score_data_path = '/content/data/universityPrediction/score.csv'

# Read both CSV files into DataFrames in one pass (same order as the paths above)
original_df, score_df = (
    pd.read_csv(csv_path) for csv_path in (original_data_path, score_data_path)
)

# Preview the first rows of each frame, each under its own heading line
print("Original Dataset:", original_df.head(), sep="\n")
print("\nGRE Score Conversion Dataset:", score_df.head(), sep="\n")


Original Dataset:
      userName                       major  researchExp  industryExp  \
0       143saf         Systems and Control            0           18   
1   7790ashish   Manufacturing Engineering            0            0   
2         AB25  (MIS / MSIM / MSIS / MSIT)            0           66   
3     abhijitg                         NaN            0            0   
4  abhijitgang                         MIS            0            0   

  specialization  toeflScore program                 department toeflEssay  \
0       Robotics       112.0      MS  Instrumentation & Control         26   
1            NaN         NaN      MS                          0        NaN   
2            NaN        94.0      MS       Computer Engineering         21   
3            NaN         NaN     NaN                          0        NaN   
4            NaN        81.0      MS                   computer        NaN   

   internExp  ...  termAndYear  confPubs                    ugCollege gmatA  \
0

In [44]:
import pandas as pd

# Loading the student profiles dataset from the extraction folder
profiles_path = os.path.join(extraction_path, 'original_data.csv')  # Update if the filename is different
profiles_df = pd.read_csv(profiles_path)

# Preview of the first rows
print(profiles_df.head())

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the summary.
profiles_df.info()


      userName                       major  researchExp  industryExp  \
0       143saf         Systems and Control            0           18   
1   7790ashish   Manufacturing Engineering            0            0   
2         AB25  (MIS / MSIM / MSIS / MSIT)            0           66   
3     abhijitg                         NaN            0            0   
4  abhijitgang                         MIS            0            0   

  specialization  toeflScore program                 department toeflEssay  \
0       Robotics       112.0      MS  Instrumentation & Control         26   
1            NaN         NaN      MS                          0        NaN   
2            NaN        94.0      MS       Computer Engineering         21   
3            NaN         NaN     NaN                          0        NaN   
4            NaN        81.0      MS                   computer        NaN   

   internExp  ...  termAndYear  confPubs                    ugCollege gmatA  \
0        5.0  ...  

In [45]:
# Print a heading followed by every column label of original_df as a plain list
print("Column names in Original Dataset:", original_df.columns.tolist(), sep="\n")


Column names in Original Dataset:
['userName', 'major', 'researchExp', 'industryExp', 'specialization', 'toeflScore', 'program', 'department', 'toeflEssay', 'internExp', 'greV', 'greQ', 'userProfileLink', 'journalPubs', 'greA', 'topperCgpa', 'termAndYear', 'confPubs', 'ugCollege', 'gmatA', 'cgpa', 'gmatQ', 'cgpaScale', 'gmatV', 'univName', 'admit']


In [68]:
from sklearn.impute import SimpleImputer
import numpy as np

# Columns handled by each imputation strategy
numerical_features = ['greQ', 'greV', 'greA', 'toeflScore', 'cgpa']
categorical_features = ['major', 'program']

# Numeric gaps are filled with the column mean ('median' is the usual alternative);
# categorical gaps are filled with the most frequent value (a constant such as
# 'unknown' via strategy='constant' would be another option).
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# NOTE(review): both imputers are fit on the whole of original_df. If a
# train/test split is made afterwards, held-out rows contribute to the fill
# values - consider fitting on the training portion only.
for imputer, columns in (
    (num_imputer, numerical_features),
    (cat_imputer, categorical_features),
):
    # Fit on the selected columns and write the filled values back into the frame
    original_df[columns] = imputer.fit_transform(original_df[columns])

# Remaining missing-value count per column
print(original_df.isna().sum())


userName               0
major                  0
researchExp            0
industryExp            0
specialization     21695
toeflScore             0
program                0
department             1
toeflEssay         41770
internExp              0
greV                   0
greQ                   0
userProfileLink        0
journalPubs          322
greA                   0
topperCgpa             3
termAndYear          322
confPubs             322
ugCollege           2278
gmatA              53525
cgpa                   0
gmatQ              53521
cgpaScale              0
gmatV              53530
univName               0
admit                  0
dtype: int64


In [69]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Model inputs and the label column
features = ['greQ', 'greV', 'greA', 'toeflScore', 'cgpa']
target = 'admit'

# Feature matrix and integer-cast label vector taken from the same frame
X, y = original_df[features], original_df[target].astype('int')

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows and report plain accuracy
predictions = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")



Accuracy: 0.5664554012489514


In [70]:
# One hypothetical applicant, keyed by the same column names the model was trained on
student_profile = {
    'greQ': 360,
    'greV': 355,
    'greA': 4.0,
    'toeflScore': 117,
    'cgpa': 3.8,
}

# Wrap the single record in a one-row DataFrame so it can be passed to the classifier
new_student_df = pd.DataFrame([student_profile])

# predict_proba returns one row of per-class probabilities per input row;
# keep the row belonging to the only applicant
class_probabilities = clf.predict_proba(new_student_df)
admission_probability = class_probabilities[0]

print(f"Admission Probability: {admission_probability}")


Admission Probability: [0.28445238 0.71554762]
