In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


ModuleNotFoundError: No module named 'seaborn'

Load the datasets

In [None]:
# Read the seven source CSV files (paths are relative to the working directory)
# into dataset1 ... dataset7, in file-number order.
(
    dataset1,
    dataset2,
    dataset3,
    dataset4,
    dataset5,
    dataset6,
    dataset7,
) = map(
    pd.read_csv,
    (
        '1- mental-illnesses-prevalence.csv',
        '2- burden-disease-from-each-mental-illness(1).csv',
        '3- adult-population-covered-in-primary-data-on-the-prevalence-of-major-depression.csv',
        '4- adult-population-covered-in-primary-data-on-the-prevalence-of-mental-illnesses.csv',
        '5- anxiety-disorders-treatment-gap.csv',
        '6- depressive-symptoms-across-us-population.csv',
        '7- number-of-countries-with-primary-data-on-prevalence-of-mental-illnesses-in-the-global-burden-of-disease-study.csv',
    ),
)

In [None]:
# Gather the seven frames in one list so a single loop can clean all of them.
datasets = [dataset1, dataset2, dataset3, dataset4, dataset5, dataset6, dataset7]

 Handle missing values, especially in the 'Code' column

In [None]:
# Replace missing 'Code' entries with the placeholder 'Unknown' in every frame,
# then report how many nulls each column still contains.
for number, frame in enumerate(datasets, 1):
    frame['Code'] = frame['Code'].fillna('Unknown')
    remaining_nulls = frame.isnull().sum()
    print(f"Dataset {number} missing values after handling 'Code' column:\n", remaining_nulls)


In [None]:
# Force 'Schizophrenia' to a numeric dtype; entries that cannot be parsed
# become NaN because of errors='coerce'.
dataset4['Schizophrenia'] = pd.to_numeric(dataset4['Schizophrenia'], errors='coerce')

In [None]:
# List dataset1's column labels under a short heading.
print("Columns of dataset1:", dataset1.columns, sep="\n")

Exploratory Data Analysis (EDA)

In [None]:
def plot_distribution(dataset, column, title):
    """Plot a histogram with a KDE overlay for one column of a DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame expected to contain ``column``.
    column : str
        Name of the column to plot; missing values are dropped before plotting.
    title : str
        Title placed on the figure.

    Returns
    -------
    None
        If ``column`` is absent, a message is printed and no figure is created.
    """
    # Validate before opening a figure so a missing column does not leave an
    # empty 10x6 canvas behind.
    if column not in dataset.columns:
        print(f"Column '{column}' not found in the dataset.")
        return

    plt.figure(figsize=(10, 6))
    sns.histplot(dataset[column].dropna(), kde=True)
    plt.title(title)
    

In [None]:
# List dataset1's column labels under a short heading.
print("Columns of dataset1:", dataset1.columns, sep="\n")

In [None]:
# List dataset5's column labels under a short heading.
print("Columns of dataset5:", dataset5.columns, sep="\n")

In [None]:
# Histogram + KDE of the schizophrenia share column (dataset1) and of the
# 'Untreated, conditional' column (dataset5).
plot_distribution(dataset1, 'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized', 'Distribution of Schizophrenia Prevalence')
plot_distribution(dataset5, 'Untreated, conditional', 'Distribution of Anxiety Disorders Treatment Gap')

Compare mental illness rates across different countries and years

In [None]:
def compare_mental_illness_rates(dataset, illness_column, title, ylabel=None):
    """Draw one line per entity showing how a prevalence column changes by year.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with 'Year' and 'Entity' columns plus ``illness_column``.
    illness_column : str
        Column plotted on the y-axis.
    title : str
        Figure title.
    ylabel : str, optional
        Y-axis label. Defaults to ``illness_column`` so the label always
        matches the plotted data instead of a fixed illness name.
    """
    plt.figure(figsize=(14, 8))  # Adjust the figure size as needed
    sns.lineplot(data=dataset, x='Year', y=illness_column, hue='Entity', palette='tab10', linewidth=2.5)
    plt.title(title)
    plt.xlabel('Year')
    plt.ylabel(illness_column if ylabel is None else ylabel)
    # Place the legend once, outside the axes, and only then tighten the layout
    # so the layout pass accounts for the final legend position.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()

In [None]:
# Per-entity line chart of the schizophrenia share column from dataset1.
compare_mental_illness_rates(dataset1, 'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized', 'Schizophrenia Rates Over Time by Country')

In [None]:
def treatment_gap_impact(dataset1, dataset5):
    """Scatter the untreated share against the schizophrenia share.

    The two frames are inner-joined on 'Entity' and 'Year'; the joined rows are
    plotted with 'Untreated, conditional' on the x-axis and the
    age-standardized schizophrenia share on the y-axis.
    """
    gap_column = 'Untreated, conditional'
    prevalence_column = 'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized'

    joined = dataset1.merge(dataset5, how='inner', on=['Entity', 'Year'])

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=gap_column, y=prevalence_column, data=joined)
    plt.title('Impact of Treatment Gap on Schizophrenia Prevalence')

In [None]:
# Scatter plot built from dataset1 joined with dataset5.
treatment_gap_impact(dataset1, dataset5)

In [None]:
def correlation_analysis(dataset, columns, title):
    """Show an annotated heatmap of pairwise correlations.

    ``columns`` selects the subset of ``dataset`` whose correlation matrix
    (``DataFrame.corr`` defaults) is drawn with the 'coolwarm' colormap under
    the given ``title``.
    """
    selected = dataset[columns]
    corr_matrix = selected.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
    plt.title(title)

In [None]:
# Show dataset4's column labels.
print(dataset4.columns)

In [None]:
# Correlation heatmap over three dataset4 columns.
correlation_analysis(dataset4, ['Schizophrenia', 'Bipolar disorder', 'Eating disorders'], 'Correlation Analysis of Mental Illnesses')

In [None]:
def trend_analysis(dataset, column, title):
    """Plot ``column`` against 'Year' as a single aggregated line.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'Year' column plus ``column``.
    column : str
        Column plotted on the y-axis.
    title : str
        Figure title.
    """
    plt.figure(figsize=(10, 6))
    # errorbar=None suppresses the uncertainty band; it replaces the deprecated
    # ci=None spelling.
    sns.lineplot(data=dataset, x='Year', y=column, errorbar=None)
    plt.title(title)

In [None]:
# Aggregated yearly trend of the schizophrenia share column from dataset1.
trend_analysis(dataset1, 'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized', 'Trend Analysis of Schizophrenia Prevalence Over Time')

In [None]:
def generate_visual_report():
    """Produce every figure of the exploratory analysis in one pass.

    Calls, in order: the two distribution plots, the per-entity comparison,
    the treatment-gap scatter, the correlation heatmap and the yearly trend,
    all on the module-level dataset1 / dataset4 / dataset5 frames.
    """
    schizophrenia = 'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized'
    untreated = 'Untreated, conditional'
    heatmap_columns = ['Schizophrenia', 'Bipolar disorder', 'Eating disorders']

    plot_distribution(dataset1, schizophrenia, 'Distribution of Schizophrenia Prevalence')
    plot_distribution(dataset5, untreated, 'Distribution of Anxiety Disorders Treatment Gap')
    compare_mental_illness_rates(dataset1, schizophrenia, 'Schizophrenia Rates Over Time by Country')
    treatment_gap_impact(dataset1, dataset5)
    correlation_analysis(dataset4, heatmap_columns, 'Correlation Analysis of Mental Illnesses')
    trend_analysis(dataset1, schizophrenia, 'Trend Analysis of Schizophrenia Prevalence Over Time')

In [None]:
# Render all exploratory figures in sequence.
generate_visual_report()

Summarize key insights and trends

In [None]:
def summarize_insights():
    """Print the fixed heading followed by six bullet-point findings."""
    report_lines = (
        "Key Insights and Trends:",
        "- Schizophrenia prevalence shows a normal distribution across the population.",
        "- Significant treatment gaps exist for anxiety disorders, with a wide range in percentages.",
        "- Schizophrenia rates vary across countries and years, showing different trends.",
        "- A higher treatment gap is associated with higher prevalence of schizophrenia.",
        "- There are moderate correlations between different mental illnesses, indicating potential common factors.",
        "- Schizophrenia prevalence has shown varying trends over the years, highlighting changes in healthcare and awareness.",
    )
    # One print call, newline-separated: same output as one print per line.
    print(*report_lines, sep="\n")


In [None]:
# Print the written summary of findings.
summarize_insights()

Linear Regression and Random Forest Regressor to predict Bipolar disorder prevalence

In [None]:
# Inner-join the prevalence and treatment-gap frames on entity and year, then
# show which columns the join produced.
merged_df = dataset1.merge(dataset5, how='inner', on=['Entity', 'Year'])
print("Merged DataFrame columns:", merged_df.columns)

In [None]:
# Inner-join the prevalence and treatment-gap frames on entity and year.
merged_df = pd.merge(dataset1, dataset5, on=['Entity', 'Year'], how='inner')
print("Merged DataFrame columns:", merged_df.columns)  # Verify the column names

# Every column selected below must exist, not only the schizophrenia one;
# otherwise the selection would raise KeyError despite the guard.
required_columns = [
    'Year',
    'Entity',
    'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized',
    'Untreated, conditional',
]
missing_columns = [name for name in required_columns if name not in merged_df.columns]

if missing_columns:
    # Report the exact missing names instead of a column label that does not exist.
    print(f"Error: columns not found in merged DataFrame: {missing_columns}")
else:
    # Select relevant columns and drop rows with any missing value
    merged_df = merged_df[required_columns].dropna()

    # Encode categorical variables (drop_first omits one indicator column)
    merged_df = pd.get_dummies(merged_df, columns=['Entity'], drop_first=True)

    

In [None]:
# Show dataset1's column labels before renaming them.
print(dataset1.columns)

In [None]:
# Preview of the shortened column names. rename() returns a new frame and the
# result is not assigned here, so dataset1 itself keeps its original labels.
dataset1.rename(columns={'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized': 'Schizophrenia disorders', 
                          'Depressive disorders (share of population) - Sex: Both - Age: Age-standardized': 'Depressive disorders',
                         'Anxiety disorders (share of population) - Sex: Both - Age: Age-standardized':'Anxiety disorders',
                         'Bipolar disorders (share of population) - Sex: Both - Age: Age-standardized':'Bipolar disorders',
                         'Eating disorders (share of population) - Sex: Both - Age: Age-standardized':'Eating disorders'})

In [None]:
# Build the modelling frame: dataset1 with the five prevalence columns renamed
# to their short disorder names (the shared
# " (share of population) - Sex: Both - Age: Age-standardized" suffix removed).
df = pd.DataFrame(
    dataset1.rename(
        columns={
            f"{disorder} (share of population) - Sex: Both - Age: Age-standardized": disorder
            for disorder in (
                'Schizophrenia disorders',
                'Depressive disorders',
                'Anxiety disorders',
                'Bipolar disorders',
                'Eating disorders',
            )
        }
    )
)
df.head()



In [None]:
# Summarise dtypes and non-null counts of the renamed frame.
df.info()

In [None]:
# Predictor columns and regression target for the models below.
features = ['Schizophrenia disorders', 'Depressive disorders','Anxiety disorders','Eating disorders']
target = "Bipolar disorders"

# Report every missing column (predictors and target), then stop with one
# explicit error rather than continuing into an opaque indexing failure.
missing = [column for column in features + [target] if column not in df.columns]
for column in missing:
    print(f"Column '{column}' not found in the dataframe.")
if missing:
    raise KeyError(f"Columns missing from df: {missing}")

X_model = df[features]
y_model = df[target]

In [None]:
from sklearn import preprocessing
from sklearn import metrics

In [None]:
# Min-max scale the predictor columns; `scaler` keeps the fitted parameters.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-row minima/maxima influence the transform - consider fitting on the
# training rows only.
scaler = preprocessing.MinMaxScaler().fit(X_model)
X = scaler.transform(X_model)

In [None]:
# Display the scaled feature matrix.
X

In [None]:
import pickle

# Persist the fitted scaler; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

Linear Regression

In [None]:
 # Hold out 20% of the scaled rows for testing; random_state=0 fixes the split.
 X_train, X_test, y_train, y_test = train_test_split(X, y_model, test_size=0.2, random_state=0)

In [None]:
# Fit an ordinary linear regression on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [None]:
# Linear-regression predictions for the held-out rows.
y_pred_lr = lr_model.predict(X_test)

Performance Metrics

In [None]:
# Score the linear model on the test split: mean squared error and R^2.
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

In [None]:
# Report both linear-regression metrics.
print("Linear Regression - Mean Squared Error:", mse_lr)
print("Linear Regression - R-squared:", r2_lr)

Random Forest Regressor

In [None]:
# Seed the forest so the fitted model, the saved pickle and the reported scores
# are reproducible across runs (the train/test split is already seeded with 0).
rf_model = RandomForestRegressor(random_state=0)

# Train the model
rf_model.fit(X_train, y_train)

# Persist the trained model to disk
import pickle
with open('randomForest.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Make predictions
y_pred = rf_model.predict(X_test)

Performance Metrics

In [None]:
# Score the random forest on the test split: mean squared error and R^2.
mse_rf = mean_squared_error(y_test, y_pred)
r2_rf = r2_score(y_test, y_pred)

In [None]:
 # Report both random-forest metrics.
 print("Random Forest Regressor - Mean Squared Error:", mse_rf)
 print("Random Forest Regressor - R-squared:", r2_rf)

In [None]:
# Left panel of a 1x2 grid: linear-regression predictions against actual test
# values, with a dashed black identity line spanning the range of y_test.
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred_lr)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Linear Regression: Actual vs Predicted Bipolar disorder')

In [None]:
# With the inline backend each cell's figure is rendered and closed when the
# cell finishes, so this cell cannot draw onto the previous cell's canvas.
# Open a matching 14x6 figure so the right-hand panel has the same geometry
# instead of landing on a default-size figure.
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred)
# Dashed identity line: points on it are perfect predictions.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Random Forest Regressor: Actual vs Predicted Bipolar disorder')

In [None]:
# rf_model was trained on min-max scaled features, so the raw frame must pass
# through the same fitted scaler before prediction; feeding unscaled values
# would put every feature on the wrong scale for the trees' split thresholds.
predictions = rf_model.predict(scaler.transform(X_model))
print(predictions)

In [None]:
# Re-split the unscaled feature frame (same 80/20 split, seed 0) and fit a
# second forest on it. This overwrites X_train, X_test, y_train, y_test and
# y_pred from the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X_model, y_model,test_size=0.2,random_state=0)
# Seed the forest as well so the reported R^2 is reproducible between runs.
Model = RandomForestRegressor(random_state=0)
Model.fit(X_train, y_train)
y_pred = Model.predict(X_test)
r2 = metrics.r2_score(y_test, y_pred)
r2

In [None]:
# (x, y) pairs for the scatter below: actual targets first, predictions second,
# both against the test-split 'Eating disorders' values.
a, b = X_test["Eating disorders"], y_test
c, d = X_test["Eating disorders"], y_pred

In [None]:
# Overlay actual (blue) and predicted (maroon, "H" marker) target values
# against the 'Eating disorders' feature for the test rows, then print the
# raw predictions.
plt.figure(figsize= (20,10), dpi=200)
plt.title("Eating Prediction")
plt.xlabel("Eating")
plt.ylabel("Bipolar")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.scatter(a,b, color = 'blue', label = "Real Values")
plt.scatter(c,d, color = 'maroon', label = "Predicted Values", marker="H", s=80)
plt.legend(fontsize=15)
print(y_pred)

In [None]:
# (x, y) pairs for the scatter below: actual targets first, predictions second,
# both against the test-split 'Schizophrenia disorders' values.
a1, b1 = X_test["Schizophrenia disorders"], y_test
c1, d1 = X_test["Schizophrenia disorders"], y_pred

In [None]:
# Overlay actual (blue) and predicted (orange, "H" marker) target values
# against the 'Schizophrenia disorders' feature for the test rows, then print
# the raw predictions.
plt.figure(figsize= (20,10), dpi=200)
plt.title("Schizophrenia Prediction")
plt.xlabel("Schizophrenia")
plt.ylabel("Bipolar")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.scatter(a1,b1, color = 'blue', label = "Real Values")
plt.scatter(c1,d1, color = 'Orange', label = "Predicted Values", marker="H", s=80)
plt.legend(fontsize=15)
print(y_pred)

In [None]:
# (x, y) pairs for the scatter below: actual targets first, predictions second,
# both against the test-split 'Depressive disorders' values.
a3, b3 = X_test["Depressive disorders"], y_test
c3, d3 = X_test["Depressive disorders"], y_pred

In [None]:
# Overlay actual (blue) and predicted (green, "H" marker) target values
# against the 'Depressive disorders' feature for the test rows, then print the
# raw predictions.
plt.figure(figsize= (20,10), dpi=200)
plt.title("Depressive Prediction")
plt.xlabel("Depressive")
plt.ylabel("Bipolar")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.scatter(a3,b3, color = 'blue', label = "Real Values")
plt.scatter(c3,d3, color = 'green', label = "Predicted Values", marker="H", s=80)
plt.legend(fontsize=15)
print(y_pred)

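As we can see, the Random Forest Regressor achieves an R² score of about 0.996 (99.6%) on the held-out test set. Because this is a regression task, the figure is a coefficient of determination rather than a classification accuracy.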