In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings("ignore")



In [2]:
df = pd.read_csv("Dataset_14-day_AA_depression_symptoms_mood_and_PHQ-9.csv", encoding = "latin-1", index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16150 entries, 1 to 16150
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          16150 non-null  int64  
 1   phq1             15864 non-null  float64
 2   phq2             15613 non-null  float64
 3   phq3             15613 non-null  float64
 4   phq4             15613 non-null  float64
 5   phq5             15613 non-null  float64
 6   phq6             15613 non-null  float64
 7   phq7             15613 non-null  float64
 8   phq8             15613 non-null  float64
 9   phq9             15613 non-null  float64
 10  age              9694 non-null   float64
 11  sex              14840 non-null  object 
 12  q1               1973 non-null   float64
 13  q2               1763 non-null   float64
 14  q3               1924 non-null   float64
 15  q4               1713 non-null   float64
 16  q5               1991 non-null   float64
 17  q6          

In [3]:
df_copy = df.copy()

In [4]:
df.drop("id", inplace=True, axis = 1)
df.describe()

Unnamed: 0,user_id,phq1,phq2,phq3,phq4,phq5,phq6,phq7,phq8,phq9,...,q10,q11,q12,q13,q14,q16,q46,q47,happiness.score,phq.day
count,16150.0,15864.0,15613.0,15613.0,15613.0,15613.0,15613.0,15613.0,15613.0,15613.0,...,1843.0,1802.0,1776.0,3191.0,3841.0,1924.0,1807.0,2007.0,16150.0,16150.0
mean,96.794799,1.762544,2.06802,2.077115,2.396208,1.584961,1.848139,1.777813,1.044066,1.261705,...,1.76777,2.008879,2.168356,1.679724,0.820359,1.744283,2.175982,2.229198,1.814551,7.433062
std,54.153995,0.983393,1.030404,0.991805,0.837374,1.153328,1.20287,1.065502,1.01063,1.129255,...,1.724607,1.680118,1.635635,1.683283,1.362829,1.613481,1.700439,1.664025,0.990936,52.158329
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-116.797153
25%,52.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-13.431889
50%,102.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,1.0,1.0,...,2.0,2.0,3.0,1.0,0.0,2.0,3.0,3.0,2.0,-2.38761
75%,141.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,...,4.0,4.0,4.0,3.0,2.0,3.0,4.0,4.0,3.0,15.905327
max,185.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,211.963981


In [5]:
drop_columns = ['user_id', 'happiness.score', 'sex', 'time', 'period.name', 'start.time', 'phq.day']
df.drop(columns=drop_columns, inplace = True)

In [6]:
# define the knn imputer
imp = KNNImputer(n_neighbors=2, weights='uniform')

# fit the imputer to the data and transform to fill in missing values
df_imputed = imp.fit_transform(df)

# replace numpy array with dataframe
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

In [7]:
df = df_imputed.round().astype(int)
# update the 'col2' column in table2 with the values from table1
df_copy.update(df[['age']])

In [8]:
from sklearn.decomposition import PCA
# perform PCA with 1 principal component
pca = PCA(n_components=1)
reduced_data = pca.fit_transform(df)

# create a dataframe from the reduced data
reduced_df = pd.DataFrame(reduced_data, columns=['PC1'])

In [9]:
keep_cols = ['user_id', 'happiness.score', 'sex', 'time', 'period.name', 'start.time', 'phq.day']
df_copy = df_copy[keep_cols]
dep_df = pd.concat([reduced_df, df_copy], axis=1)
dep_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16151 entries, 0 to 16150
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PC1              16150 non-null  float64
 1   user_id          16150 non-null  float64
 2   happiness.score  16150 non-null  float64
 3   sex              14840 non-null  object 
 4   time             16150 non-null  object 
 5   period.name      16150 non-null  object 
 6   start.time       16150 non-null  object 
 7   phq.day          16150 non-null  float64
dtypes: float64(4), object(4)
memory usage: 1.1+ MB


In [10]:
#dep_df.to_csv("depression_symptoms.csv", index=False)

In [11]:
drop_cols = ['time', 'period.name', 'start.time']
dep_df.drop(columns = drop_cols, inplace=True)

In [12]:
dep_df['sex'].isnull().sum()

1311

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

# Seperate the null values from the df and create a variable "test_data"
test_data = dep_df[dep_df['sex'].isnull()]

# Drop the null values from the dataframe and represent them
dep_df.dropna(inplace=True)

# Create x_train and y_train from train data
X = dep_df.drop('sex', axis=1)
y = dep_df['sex']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.0883, random_state = 81)

# Feature scaling to the training and test set of independent variables
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Training the Naive Bayes model on the training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Let’s predict the test results
y_pred = classifier.predict(X_test)
test_data['sex'] = y_pred

y_pred.size

1311

In [14]:
for i, row in enumerate(dep_df):
    for j, value in enumerate(row):
        if value is None:
            dep_df[i][j] = test_data[i][j]

In [16]:
dep_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14839 entries, 52 to 16149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PC1              14839 non-null  float64
 1   user_id          14839 non-null  float64
 2   happiness.score  14839 non-null  float64
 3   sex              14839 non-null  object 
 4   phq.day          14839 non-null  float64
dtypes: float64(4), object(1)
memory usage: 695.6+ KB
