In [1]:
# Basic Spark Session Initialization
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Depression_Prediction_Project") \
    .getOrCreate()

spark

25/03/19 17:27:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Path to your dataset
data_path = '/home/linuxu/big data analysis final project/Student_Depression_Dataset.csv'

# Load dataset into Spark DataFrame
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Display schema clearly to ensure correctness
df.printSchema()

# Show the first 5 rows clearly
df.show(5)

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

root
 |-- id: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Profession: string (nullable = true)
 |-- Academic Pressure: double (nullable = true)
 |-- Work Pressure: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Study Satisfaction: double (nullable = true)
 |-- Job Satisfaction: double (nullable = true)
 |-- Sleep Duration: string (nullable = true)
 |-- Dietary Habits: string (nullable = true)
 |-- Degree: string (nullable = true)
 |-- Have you ever had suicidal thoughts ?: string (nullable = true)
 |-- Work/Study Hours: double (nullable = true)
 |-- Financial Stress: double (nullable = true)
 |-- Family History of Mental Illness: string (nullable = true)
 |-- Depression: integer (nullable = true)

+---+------+----+-------------+----------+-----------------+-------------+----+------------------+----------------+-----------------+--------------+-------+----------------------

# Remove Non Student Rows and Columns:

In [6]:
# Check the distinct professions before removal
df.select("Profession").distinct().show()

# Keep only rows where Profession is "Student"
df_students = df.filter(df["Profession"] == "Student")

# Verify clearly that only students remain
df_students.select("Profession").distinct().show()

# Check new row count (should be smaller than before)
print(f"New row count after filtering students: {df_students.count()}")


+--------------------+
|          Profession|
+--------------------+
|             Student|
|      Civil Engineer|
|              Lawyer|
|             Teacher|
|                Chef|
|           Architect|
|      UX/UI Designer|
|        Entrepreneur|
|    Digital Marketer|
|          Pharmacist|
|              Doctor|
|Educational Consu...|
|             Manager|
|      Content Writer|
+--------------------+

+----------+
|Profession|
+----------+
|   Student|
+----------+

New row count after filtering students: 27870


In [7]:
# Drop irrelevant columns
columns_to_remove = ["Work Pressure", "Job Satisfaction", "Profession"]  # remove "Proffession" as well, since it's now redundant (all are students)
df_students_clean = df_students.drop(*columns_to_remove)

# Verify clearly columns are removed
df_students_clean.printSchema()


root
 |-- id: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- City: string (nullable = true)
 |-- Academic Pressure: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Study Satisfaction: double (nullable = true)
 |-- Sleep Duration: string (nullable = true)
 |-- Dietary Habits: string (nullable = true)
 |-- Degree: string (nullable = true)
 |-- Have you ever had suicidal thoughts ?: string (nullable = true)
 |-- Work/Study Hours: double (nullable = true)
 |-- Financial Stress: double (nullable = true)
 |-- Family History of Mental Illness: string (nullable = true)
 |-- Depression: integer (nullable = true)



In [8]:
# Display first 5 rows to ensure dataset looks correct
df_students_clean.show(5)

# Verify again missing values for cleanliness check
from pyspark.sql.functions import col, isnan, when, count

df_students_clean.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in df_students_clean.columns]).show()


+---+------+----+-------------+-----------------+----+------------------+-----------------+--------------+-------+-------------------------------------+----------------+----------------+--------------------------------+----------+
| id|Gender| Age|         City|Academic Pressure|CGPA|Study Satisfaction|   Sleep Duration|Dietary Habits| Degree|Have you ever had suicidal thoughts ?|Work/Study Hours|Financial Stress|Family History of Mental Illness|Depression|
+---+------+----+-------------+-----------------+----+------------------+-----------------+--------------+-------+-------------------------------------+----------------+----------------+--------------------------------+----------+
|  2|  Male|33.0|Visakhapatnam|              5.0|8.97|               2.0|        5-6 hours|       Healthy|B.Pharm|                                  Yes|             3.0|             1.0|                              No|         1|
|  8|Female|24.0|    Bangalore|              2.0| 5.9|               5.0|   

[Stage 19:>                                                         (0 + 1) / 1]

+---+------+---+----+-----------------+----+------------------+--------------+--------------+------+-------------------------------------+----------------+----------------+--------------------------------+----------+
| id|Gender|Age|City|Academic Pressure|CGPA|Study Satisfaction|Sleep Duration|Dietary Habits|Degree|Have you ever had suicidal thoughts ?|Work/Study Hours|Financial Stress|Family History of Mental Illness|Depression|
+---+------+---+----+-----------------+----+------------------+--------------+--------------+------+-------------------------------------+----------------+----------------+--------------------------------+----------+
|  0|     0|  0|   0|                0|   0|                 0|             0|             0|     0|                                    0|               0|               3|                               0|         0|
+---+------+---+----+-----------------+----+------------------+--------------+--------------+------+--------------------------------

                                                                                

In [9]:
df.groupBy("Depression").count().show()

+----------+-----+
|Depression|count|
+----------+-----+
|         1|16336|
|         0|11565|
+----------+-----+

