In [None]:
import numpy as np
import pandas as pd   

In [None]:
# Load the dataset from a CSV file into a DataFrame.
# The path is relative, so the file must be in the current working directory.
data = pd.read_csv('Students Social Media Addiction.csv')

In [None]:
# Dimensions of the dataset as a (number_of_rows, number_of_columns) tuple
data.shape

In [None]:
# Viewing the first 5 rows and 5 random rows of the dataset.
# A notebook cell only renders its last expression automatically, so both
# frames are handed to display() explicitly; a bare data.head() placed before
# another expression would be evaluated and then silently discarded.
display(data.head(), data.sample(5))

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage
data.info()

In [None]:
# Number of missing (NaN/None) entries in each column
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; non-numeric columns are left out by describe()'s default settings
data.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
# (duplicated() flags every repeat after the first occurrence)
data.duplicated().sum()

In [None]:
# Pairwise correlation between a hand-picked subset of the numeric columns.
# `data_corr` keeps the selected columns so later cells can reuse them.
corr_columns = [
    'Age',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
]
data_corr = data[corr_columns]
print(data_corr.corr())

EDA-1: Univariate Analysis 

Analyzing each column individually to understand the kind of values it holds, to decide whether it needs a transformation, and to spot outliers.  

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

1. Categorical Data

In [None]:
# Keep only the object-dtype (text) columns and print how many distinct
# values each of them contains
data_categorial = data.select_dtypes(include=['object'])
print(data_categorial.nunique())

In [None]:
# Count plot for the most used platform, bars ordered from most to least frequent
# (value_counts() sorts in descending order by default).
# The column is passed through the data=/x= keywords: recent seaborn releases
# interpret the first positional argument as `data`, not as the x variable.
platform_order = data['Most_Used_Platform'].value_counts().index
sns.countplot(data=data, x='Most_Used_Platform', palette='viridis', order=platform_order)

In [None]:
# Pie charts showing the share of each category for four categorical columns,
# laid out on a 2x2 grid. Column name -> panel title.
pie_columns = {
    'Academic_Level': 'Academic Level Distribution',
    'Gender': 'Gender Distribution',
    'Affects_Academic_Performance': 'Affects Academic Performance',
    'Relationship_Status': 'Relationship Status Distribution',
}

fig, axes = plt.subplots(2, 2, figsize=(13, 10))
fig.suptitle('Categorical Data Distribution', fontsize=16)

# axes.flat walks the grid row by row, matching the order of pie_columns
for ax, (column, title) in zip(axes.flat, pie_columns.items()):
    data[column].value_counts().plot(
        kind='pie',
        autopct='%1.1f%%',
        startangle=140,
        colors=sns.color_palette('pastel'),
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('')  # drop the stray series-name label pandas puts beside the pie

2. Numerical Data

In [None]:
# Numeric columns and the number of distinct values in each.
# Stored under its own name so the categorical frame is not overwritten, and
# selected with 'number' because the width of the plain 'int' alias is
# platform-dependent and can miss int64 columns.
data_numerical = data.select_dtypes(include='number')
print(data_numerical.nunique())

In [None]:
# Histogram of the number of conflicts over social media (5 bins)
fig, ax = plt.subplots()
ax.hist(
    data['Conflicts_Over_Social_Media'],
    bins=5,
    color='skyblue',
    edgecolor='black',
)
ax.set_xlabel('Conflicts Over Social Media')
ax.set_ylabel('Number of Students')
ax.set_title('Distribution of Conflicts Over Social Media')
plt.show()

In [None]:
# Histogram with a KDE overlay for the sleep-hours distribution.
# sns.distplot is deprecated; histplot is its replacement.
# stat='density' normalises the bars so the y-axis really is a density,
# matching the axis label below.
sns.histplot(x=data['Sleep_Hours_Per_Night'], kde=True, bins=7, color='green', stat='density')
plt.title('Distribution of Sleep Hours Per Night')
plt.xlabel('Sleep Hours Per Night')
plt.ylabel('Density')
plt.show()

In [None]:
# Box plots of four numeric columns on a 2x2 grid, used to eyeball outliers
fig, axes = plt.subplots(2, 2, figsize=(13, 10))
(usage_ax, age_ax), (mental_ax, addicted_ax) = axes

sns.boxplot(data=data, x='Avg_Daily_Usage_Hours', color='lightblue', ax=usage_ax)
sns.boxplot(data=data, x='Age', color='lightblue', ax=age_ax)
sns.boxplot(data=data, x='Mental_Health_Score', color='lightblue', ax=mental_ax)
sns.boxplot(data=data, x='Addicted_Score', color='lightblue', ax=addicted_ax)

EDA-2 : Bivariate Analysis 

Analyzing pairs of columns to find trends, correlations and hidden patterns.

In [None]:
# Re-print the column names and dtypes as a reference for choosing column pairs
data.info()

1. Plotting the relations between the numerical and numerical Data

In [None]:
# Scatter plot of mental-health score against addiction score, coloured by gender
sns.scatterplot(
    data=data,
    x='Mental_Health_Score',
    y='Addicted_Score',
    hue='Gender',
)

In [None]:
# Line plot of addiction score against average daily usage hours, one line per gender
sns.lineplot(
    data=data,
    x='Avg_Daily_Usage_Hours',
    y='Addicted_Score',
    hue='Gender',
    palette='deep',
)

2. Plotting the relations between the numerical and categorical Data

In [None]:
# Gender compared against four numeric columns on a 2x2 grid:
# box plot (usage hours), bar plots (mental health, conflicts) and a
# two-dimensional histogram (addiction score)
fig, axes = plt.subplots(2, 2, figsize=(13, 10))
(usage_ax, mental_ax), (conflicts_ax, addicted_ax) = axes

sns.boxplot(data=data, x='Gender', y='Avg_Daily_Usage_Hours', palette='Set2', ax=usage_ax)
sns.barplot(data=data, x='Gender', y='Mental_Health_Score', palette='Set2', ax=mental_ax)
sns.barplot(data=data, x='Gender', y='Conflicts_Over_Social_Media', palette='Set2', ax=conflicts_ax)
sns.histplot(data=data, x='Gender', y='Addicted_Score', palette='Set2', ax=addicted_ax)

These plots suggest that females use social media more than males, that their mental health is affected more as a result, and that they consequently have more conflicts over social media than males. Overall, this suggests that females are more addicted to social media than males.

3. Plotting the correlations between the numerical columns

In [None]:
# Annotated heatmap of the correlation matrix, followed by a clustered version
# of the same matrix (clustermap draws on a figure of its own)
corr_matrix = data_corr.corr()
sns.heatmap(data=corr_matrix, annot=True, cmap='YlGnBu')
sns.clustermap(data=corr_matrix, annot=True, cmap='YlGnBu')

We can also use pandas profiling to run the EDA automatically and export the result as an HTML report.

In [None]:
# Optional: generate an automated HTML profiling report of the dataset.
# Left commented out, so nothing runs in this cell.
# NOTE(review): the pandas_profiling package has been renamed to ydata_profiling
# in newer releases — confirm which one is installed before enabling this cell.
# from pandas_profiling import ProfileReport
# profile = ProfileReport(data, title="Students Social Media Addiction Data Profiling Report")
# profile.to_file("students_social_media_addiction_report.html")

Feature Engineering

In [None]:
# Column names and dtypes before encoding, to see which columns are still text
data.info()

In [None]:
# Looking for the unique values and their counts in categorical columns.
# value_counts() lists every category next to its frequency; chaining
# .unique() onto it would drop the category labels and merge equal counts.
for column in ['Academic_Level', 'Most_Used_Platform', 'Relationship_Status']:
    print(data[column].value_counts(), end='\n\n')

In [None]:
# One-hot encode the multi-valued categorical columns, then replace the
# original text columns (and the Student_ID identifier) with the indicator columns
one_hot_columns = ['Academic_Level', 'Most_Used_Platform', 'Relationship_Status']
data_one_hot = pd.get_dummies(data[one_hot_columns], drop_first=False, dtype=int)

columns_to_drop = ['Student_ID'] + one_hot_columns
data = pd.concat([data.drop(columns=columns_to_drop), data_one_hot], axis=1)
data.info()

In [None]:
# Encode the two binary categorical columns as 0/1 integers.
# Series.map is used instead of DataFrame.replace: replacing strings with ints
# relies on implicit object->int downcasting, which pandas has deprecated, and
# replace() silently leaves any unmapped label in the column as a string.
binary_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Affects_Academic_Performance': {'No': 0, 'Yes': 1},
}

for column, mapping in binary_encodings.items():
    # Already encoded (e.g. the cell was re-run): leave the column untouched
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    encoded = data[column].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly rather than keep a mixed string/number column
        unexpected = sorted(set(data.loc[unmapped, column].astype(str)))
        raise ValueError(f"Unexpected values in column {column!r}: {unexpected}")
    data[column] = encoded.astype('int64')

data.head()

In [None]:
# Rescale the numeric columns to the [0, 1] range with scikit-learn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaled_columns = [
    'Age',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score',
]

scaler = MinMaxScaler()
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

data.head()

We used Min-Max scaling because the minimum and maximum values of these columns are known and the data does not contain many outliers. Now let's check the Country column.

In [None]:
# Number of rows per country, most frequent first
data['Country'].value_counts()

Since Country has too many unique values, we will drop this column as it may not contribute significantly to the model performance.


In [None]:
# Remove the Country column and confirm the remaining columns
data = data.drop(columns=['Country'])
data.info()

In [None]:
# Show 10 randomly chosen rows of the processed dataset
# (no random_state is set, so the rows differ on every run)
data.sample(10)