# Chance of Admission for Higher Studies
#### Predict the chances of admission of a student to a Graduate program based on:

1. GRE Scores (290 to 340)
2. TOEFL Scores (92 to 120)
3. University Rating (1 to 5)
4. Statement of Purpose (1 to 5)
5. Letter of Recommendation Strength (1 to 5)
6. Undergraduate CGPA (6.8 to 9.92)
7. Research Experience (0 or 1)
8. Chance of Admit (0.34 to 0.97) — this is the target value being predicted, not an input feature

In [0]:
# List the files uploaded under dbfs:/FileStore/tables/ to locate the input CSV
dbutils.fs.ls("dbfs:/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/Admission_Chance.csv', name='Admission_Chance.csv', size=12905, modificationTime=1720190058000),
 FileInfo(path='dbfs:/FileStore/tables/Cancer.csv', name='Cancer.csv', size=125204, modificationTime=1720190099000),
 FileInfo(path='dbfs:/FileStore/tables/Credit_Default.csv', name='Credit_Default.csv', size=101152, modificationTime=1720190106000),
 FileInfo(path='dbfs:/FileStore/tables/Customer_Purchase.csv', name='Customer_Purchase.csv', size=1489, modificationTime=1720190113000),
 FileInfo(path='dbfs:/FileStore/tables/Fish.csv', name='Fish.csv', size=6349, modificationTime=1720190119000),
 FileInfo(path='dbfs:/FileStore/tables/Ice_Cream.csv', name='Ice_Cream.csv', size=4872, modificationTime=1720190124000),
 FileInfo(path='dbfs:/FileStore/tables/Test1.csv', name='Test1.csv', size=108, modificationTime=1720158698000),
 FileInfo(path='dbfs:/FileStore/tables/Test2.csv', name='Test2.csv', size=192, modificationTime=1720158698000),
 FileInfo(path='dbfs:

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Obtain the Spark session for this notebook under a descriptive application name
spark = (
    SparkSession.builder
    .appName('Chance of Admission for Higher Studies')
    .getOrCreate()
)

In [0]:
# Display the SparkSession object
spark

In [0]:

# Load the admissions CSV: the first row supplies the column names and
# column types are inferred from the data
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('dbfs:/FileStore/tables/Admission_Chance.csv')
)

In [0]:
# Print the column names and the types inferred while reading the CSV
df_pyspark.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |--  SOP: double (nullable = true)
 |-- LOR : double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit : double (nullable = true)



In [0]:
# Display the DataFrame's repr (column names and types on one line)
df_pyspark

DataFrame[Serial No: int, GRE Score: int, TOEFL Score: int, University Rating: int,  SOP: double, LOR : double, CGPA: double, Research: int, Chance of Admit : double]

In [0]:
# Preview the first rows of the raw data (show() prints up to 20 rows by default)
df_pyspark.show()

+---------+---------+-----------+-----------------+----+----+----+--------+----------------+
|Serial No|GRE Score|TOEFL Score|University Rating| SOP|LOR |CGPA|Research|Chance of Admit |
+---------+---------+-----------+-----------------+----+----+----+--------+----------------+
|        1|      337|        118|                4| 4.5| 4.5|9.65|       1|            0.92|
|        2|      324|        107|                4| 4.0| 4.5|8.87|       1|            0.76|
|        3|      316|        104|                3| 3.0| 3.5| 8.0|       1|            0.72|
|        4|      322|        110|                3| 3.5| 2.5|8.67|       1|             0.8|
|        5|      314|        103|                2| 2.0| 3.0|8.21|       0|            0.65|
|        6|      330|        115|                5| 4.5| 3.0|9.34|       1|             0.9|
|        7|      321|        109|                3| 3.0| 4.0| 8.2|       1|            0.75|
|        8|      308|        101|                2| 3.0| 4.0| 7.9|    

# 1. Clean the DataFrame

In [0]:
# Normalise the column names: strip surrounding whitespace and
# turn the remaining inner spaces into underscores
cleaned_names = [name.strip().replace(' ', '_') for name in df_pyspark.columns]
df_pyspark = df_pyspark.toDF(*cleaned_names)

# Confirm the renamed columns
df_pyspark.printSchema()

root
 |-- Serial_No: integer (nullable = true)
 |-- GRE_Score: integer (nullable = true)
 |-- TOEFL_Score: integer (nullable = true)
 |-- University_Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance_of_Admit: double (nullable = true)



In [0]:
# Discard every row that has a null in at least one column
df_pyspark = df_pyspark.dropna(how="any")

# 2. Prepare the DataFrame

In [0]:
# Predictor columns fed to the model (Serial_No and the target are left out)
feature_columns = ["GRE_Score", "TOEFL_Score", "University_Rating", "SOP", "LOR", "CGPA", "Research"]

# Pack the predictors into one vector column named "features"
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Keep only the assembled vector and the label column
df_pyspark = assembler.transform(df_pyspark).select("features", "Chance_of_Admit")

In [0]:
# Preview the assembled feature vectors next to the target column
df_pyspark.show()

+--------------------+---------------+
|            features|Chance_of_Admit|
+--------------------+---------------+
|[337.0,118.0,4.0,...|           0.92|
|[324.0,107.0,4.0,...|           0.76|
|[316.0,104.0,3.0,...|           0.72|
|[322.0,110.0,3.0,...|            0.8|
|[314.0,103.0,2.0,...|           0.65|
|[330.0,115.0,5.0,...|            0.9|
|[321.0,109.0,3.0,...|           0.75|
|[308.0,101.0,2.0,...|           0.68|
|[302.0,102.0,1.0,...|            0.5|
|[323.0,108.0,3.0,...|           0.45|
|[325.0,106.0,3.0,...|           0.52|
|[327.0,111.0,4.0,...|           0.84|
|[328.0,112.0,4.0,...|           0.78|
|[307.0,109.0,3.0,...|           0.62|
|[311.0,104.0,3.0,...|           0.61|
|[314.0,105.0,3.0,...|           0.54|
|[317.0,107.0,3.0,...|           0.66|
|[319.0,106.0,3.0,...|           0.65|
|[318.0,110.0,3.0,...|           0.63|
|[303.0,102.0,3.0,...|           0.62|
+--------------------+---------------+
only showing top 20 rows



# 3. Split the DataFrame

In [0]:
# Randomly divide the rows into training and test sets using 0.8 / 0.2 weights;
# the fixed seed is passed so the split does not depend on a random default
split_weights = [0.8, 0.2]
train_data, test_data = df_pyspark.randomSplit(split_weights, seed=42)

# 4. Train the Model

In [0]:
# Configure the linear regression estimator: "features" is the input vector,
# "Chance_of_Admit" is the label to learn
lr = LinearRegression(labelCol="Chance_of_Admit", featuresCol="features")

# Learn the coefficients from the training split
lr_model = lr.fit(train_data)

# 5. Evaluate the Model

In [0]:
# Score the held-out rows with the fitted model
predictions = lr_model.transform(test_data)

# One evaluator per metric, each comparing "prediction" against "Chance_of_Admit"
rmse_evaluator, mae_evaluator, mse_evaluator = (
    RegressionEvaluator(labelCol="Chance_of_Admit", predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "mae", "mse")
)
rmse = rmse_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)

# MAPE: per-row |prediction - actual| / actual, averaged and scaled to a percentage.
# `abs` here is pyspark.sql.functions.abs from the import cell, not the Python builtin.
predictions = (
    predictions
    .withColumn("absolute_error", abs(col("prediction") - col("Chance_of_Admit")))
    .withColumn("percentage_error", col("absolute_error") / col("Chance_of_Admit"))
)
mape_rows = predictions.selectExpr("mean(percentage_error) as MAPE").collect()
mape = mape_rows[0]["MAPE"] * 100

# Report the fitted parameters of the linear model
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

# Report the test-set error metrics
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")
print(f"Mean Absolute Error (MAE) on test data: {mae}")
print(f"Mean Squared Error (MSE) on test data: {mse}")
print(f"Mean Absolute Percentage Error (MAPE) on test data: {mape}%")




Coefficients: [0.0021441557278328955,0.0029448787202701206,0.010158762766901787,-0.007798086707145683,0.021356248341924733,0.11698856317086154,0.020279525006144286]
Intercept: -1.3677594308099525
Root Mean Squared Error (RMSE) on test data: 0.07615891582124568
Mean Absolute Error (MAE) on test data: 0.05460223519961452
Mean Squared Error (MSE) on test data: 0.0058001804590675846
Mean Absolute Percentage Error (MAPE) on test data: 8.858109656537529%


In [0]:
# Preview five scored rows: model output, true label and the input vector
predictions.select(["prediction", "Chance_of_Admit", "features"]).show(n=5)

+------------------+---------------+--------------------+
|        prediction|Chance_of_Admit|            features|
+------------------+---------------+--------------------+
|0.5690680965373631|           0.64|[293.0,97.0,2.0,2...|
|0.4575038118777415|           0.47|[295.0,96.0,2.0,1...|
|0.5252641716207944|           0.69|[295.0,101.0,2.0,...|
| 0.517547871552895|            0.6|[296.0,101.0,1.0,...|
| 0.519501855786225|           0.44|[298.0,98.0,2.0,1...|
+------------------+---------------+--------------------+
only showing top 5 rows



In [0]:
# Count rows for each distinct (actual, predicted) pair.
# NOTE(review): this is not a confusion matrix. Both the label and the prediction are
# continuous values, so the pairs are essentially unique and every count shown is 1.
# A confusion matrix would require bucketing both columns into classes first.

from pyspark.sql.functions import expr  # NOTE(review): not used in this cell

predictions.groupBy("Chance_of_Admit", "prediction").count().show()


+---------------+-------------------+-----+
|Chance_of_Admit|         prediction|count|
+---------------+-------------------+-----+
|           0.84| 0.8055232646273054|    1|
|           0.84| 0.8324665476939876|    1|
|           0.89| 0.8556019589233999|    1|
|           0.84| 0.7300319411626703|    1|
|           0.42| 0.5780706595995149|    1|
|           0.68| 0.6427334663227757|    1|
|           0.92| 0.8923168272132993|    1|
|           0.69| 0.6345901719948519|    1|
|           0.63|  0.579325347266817|    1|
|           0.94| 0.9523583592136564|    1|
|           0.95| 0.9397956184170275|    1|
|           0.54| 0.5394947255114388|    1|
|            0.6|  0.517547871552895|    1|
|           0.52| 0.7556021156482222|    1|
|           0.67| 0.5799590930399572|    1|
|           0.36|0.41230903474397484|    1|
|           0.91| 0.8577255286149286|    1|
|           0.74| 0.8073395046877359|    1|
|            0.7| 0.7779591460733304|    1|
|           0.75| 0.748441064615

In [0]:
# Save the trained linear regression model.
# write().overwrite() replaces a model already stored at this path, so the cell can be
# re-run after retraining; a plain save() raises when the target path already exists.
model_path = "./Internship_Sem-6_models/Chance_of_addmission_model"
lr_model.write().overwrite().save(model_path)

In [0]:
# List the saved model directory; a saved model shows data/ and metadata/ entries
dbutils.fs.ls("dbfs:/Internship_Sem-6_models/Chance_of_addmission_model")

[FileInfo(path='dbfs:/Internship_Sem-6_models/Chance_of_addmission_model/data/', name='data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/Internship_Sem-6_models/Chance_of_addmission_model/metadata/', name='metadata/', size=0, modificationTime=0)]

In [0]:
%sh
# List the local /tmp directory before copying the model into it
ls /tmp

In [0]:
# Copy the saved model directory from DBFS to local /tmp (file:/ scheme) so it can be zipped.
# The source is the directory the model was saved to and listed from
# (dbfs:/Internship_Sem-6_models/...), not a directory at the DBFS root.
dbutils.fs.cp(
    "dbfs:/Internship_Sem-6_models/Chance_of_addmission_model",
    "file:/tmp/Chance_of_addmission_model",
    recurse=True,
)


In [0]:
%sh
# Show the contents of the model directory copied into /tmp
ls /tmp/Chance_of_addmission_model

In [0]:
%sh
# Remove any archive left by an earlier run first: zip updates an existing archive in
# place, so files from a previously saved model would otherwise remain inside it.
rm -f /tmp/Chance_of_addmission_model.zip
zip -r /tmp/Chance_of_addmission_model.zip /tmp/Chance_of_addmission_model


In [0]:
# Copy the zip archive from local /tmp into dbfs:/FileStore
dbutils.fs.cp("file:/tmp/Chance_of_addmission_model.zip","dbfs:/FileStore/Chance_of_addmission_model.zip")


In [0]:
# List dbfs:/FileStore to check that the zip archive is present
dbutils.fs.ls("dbfs:/FileStore")
# Disabled cleanup of a model directory at the DBFS root (kept commented out):
#dbutils.fs.rm("dbfs:/Chance_of_addmission_model/",recurse=True)