In [27]:
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency, ttest_ind
from statsmodels.formula.api import ols
import statsmodels.api as sm
import openpyxl

# Step 1: mount Google Drive so the dataset (and the results folder) can be
# reached from the Colab runtime.
drive.mount('/content/drive')

# Step 2: locations on the mounted drive.
# The CSV is expected directly inside the 'Students_depression' folder; the same
# folder is declared as the output directory.
output_dir = "/content/drive/MyDrive/Students_depression"
file_path = "/content/drive/MyDrive/Students_depression/student_dataset.csv"

# Step 3: load the CSV into the DataFrame used by every later section.
df = pd.read_csv(file_path)
print("‚úÖ Arquivo CSV carregado com sucesso!")

# Step 4: preview the first five rows.
first_rows = df.head()
print("\nüîé As primeiras 5 linhas do meu DataFrame:")
print(first_rows)

# Step 5: per-column dtype and non-null count (info() prints on its own,
# so it is not wrapped in print()).
print("\n‚ÑπÔ∏è Informa√ß√µes gerais do meu DataFrame:")
df.info()

# Step 6: handle missing values.
# Report how many nulls each column holds before anything is imputed.
null_counts = df.isnull().sum()
print("\nüîπ Valores nulos por coluna:")
print(null_counts)

# Impute numeric columns only: each NaN is replaced by the mean of its column.
# Non-numeric columns are not touched here.
numeric_df = df.select_dtypes(include=['number'])
column_means = numeric_df.mean()
df[numeric_df.columns] = numeric_df.fillna(column_means)

# Step 7: descriptive statistics (describe() with its default arguments).
summary_stats = df.describe()
print("\nüîπ Estat√≠sticas descritivas:")
print(summary_stats)

# Step 8: spot outliers visually, one box per numeric column on a shared axis.
numeric_part = df.select_dtypes(include=['number'])
plt.figure(figsize=(10, 5))
sns.boxplot(data=numeric_part)
plt.title("Distribui√ß√£o de Outliers")
plt.show()

# Step 9: distribution of every object-dtype column, bars ordered from the most
# to the least frequent level.
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    levels_by_frequency = df[column].value_counts().index
    plt.figure(figsize=(8, 4))
    sns.countplot(y=df[column], order=levels_by_frequency)
    plt.title(f"Distribui√ß√£o de {column}")
    plt.show()

# Step 10: correlation heatmap restricted to the numeric columns
# (DataFrame.corr() with its default settings).
numeric_df = df.select_dtypes(include=['number'])
correlations = numeric_df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Mapa de Correla√ß√£o")
plt.show()

# 2.1 Numeric variables

# Columns treated as numeric measures in the univariate and correlation analyses.
numerical_cols = [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours',
]

# Histograms: one panel per numeric variable, all on a single figure.
numeric_view = df[numerical_cols]
numeric_view.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Boxplots: a separate figure for each numeric variable.
for column in numerical_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[column])
    plt.title(f'Boxplot de {column}')
    plt.show()

# Descriptive statistics for the selected numeric variables.
numeric_summary = numeric_view.describe()
print(numeric_summary)

# 2.2 Categorical variables

# Columns treated as categorical in the plots below and label-encoded later on.
categorical_cols = [
    'Gender', 'City', 'Profession', 'Sleep Duration', 'Dietary Habits',
    'Degree', 'Have you ever had suicidal thoughts ?', 'Financial Stress',
    'Family History of Mental Illness',
]

# Bar plots: frequency of each level, one figure per categorical variable.
for column in categorical_cols:
    level_counts = df[column].value_counts()
    plt.figure(figsize=(8, 6))
    level_counts.plot(kind='bar')
    plt.title(f'Gr√°fico de barras para {column}')
    # Tilt the tick labels so long category names stay readable.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


# 3. Correlation analysis

# 3.1 Correlation matrix of the numeric variables, drawn as an annotated heatmap.
corr_matrix = df[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Matriz de Correla√ß√£o de Vari√°veis Num√©ricas")
plt.show()

# 3.2 Correlation of each numeric variable with the Depression column,
# sorted from the highest to the lowest coefficient.
depression = df['Depression']
unsorted_corr = df[numerical_cols].corrwith(depression)
corr_depression = unsorted_corr.sort_values(ascending=False)
print("\nCorrela√ß√£o com a vari√°vel Depress√£o:")
print(corr_depression)

# 3.3 Boxplots of every numeric variable split by every categorical variable
# (one figure per categorical/numeric pair).
for category in categorical_cols:
    for measure in numerical_cols:
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df, x=category, y=measure)
        plt.title(f'Boxplot de {measure} por {category}')
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()



# 4. Predictive modelling
# 4.1 Data preprocessing

# Replace every categorical column, in place, by integer codes.
# A single encoder object is refitted for each column, so after the loop it
# only holds the classes of the last column processed.
label_encoder = LabelEncoder()
for column in categorical_cols:
    df[column] = label_encoder.fit_transform(df[column])

# Target is 'Depression'; predictors are all remaining columns except the
# row identifier and the two job-related measures.
y = df['Depression']
X = df.drop(columns=['Depression', 'id', 'Job Satisfaction', 'Work Pressure'])

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4.2 Fit the logistic regression classifier on the training split.
logistic_model = LogisticRegression(solver='liblinear', random_state=42)
logistic_model.fit(X_train, y_train)

# 4.3 Evaluate on the held-out split.
y_pred = logistic_model.predict(X_test)

# The report text is kept in a variable because it is exported with the
# other results at the end of the script.
classification_rep = classification_report(y_test, y_pred)
print("\nRelat√≥rio de Classifica√ß√£o do Modelo de Regress√£o Log√≠stica:")
print(classification_rep)


# 5. Segmentation

# 5.1 Variables fed to the clustering step.
# NOTE(review): the features are used on their raw scales (no scaler is applied),
# and 'Depression' is itself one of the clustering inputs, so the per-cluster
# depression distribution printed below is not independent of the clusters.
cluster_vars = [
    'Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction',
    'Work/Study Hours', 'Depression',
]
X_cluster = df[cluster_vars].copy()

# 5.2 Three clusters, k-means++ initialisation, 10 restarts, fixed seed.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)

# 5.3 Fit K-means and store each row's cluster label on the DataFrame.
df['cluster'] = kmeans.fit_predict(X_cluster)

# Visualise how every clustering variable is distributed inside each cluster.
for variable in cluster_vars:
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x='cluster', y=variable)
    plt.title(f"Boxplot de {variable} por Cluster")
    plt.show()

# Share of each Depression value within every cluster (proportions, not counts).
depression_by_cluster = df.groupby('cluster')['Depression']
cluster_depression_dist = depression_by_cluster.value_counts(normalize=True)
print("\nDistribui√ß√£o da Depress√£o por Cluster:")
print(cluster_depression_dist)


# 6. Hypothesis tests

# 6.1 Two-sample t-tests comparing mean academic pressure between gender groups
# (ttest_ind called with its default arguments).
# 'Gender' is label-encoded earlier in the script, so the group names printed
# below are the integer codes rather than the original labels.
grouped_data_academic = df.groupby('Gender')['Academic Pressure']

# One formatted summary line per comparison, kept so the results can be
# exported together with the rest of the analysis.
ttest_results = []

# Compare every unordered pair of groups exactly once.
groups = df['Gender'].unique()
for position, first in enumerate(groups):
    for second in groups[position + 1:]:
        sample_a = grouped_data_academic.get_group(first)
        sample_b = grouped_data_academic.get_group(second)
        t_stat, p_value = ttest_ind(sample_a, sample_b)

        header = f"Teste t para Press√£o Acad√™mica entre {first} e {second}:"
        stats_text = f"Estat√≠stica t: {t_stat:.2f}, p-valor: {p_value:.3f}"

        # Stored as a single line; printed as a header plus an indented detail line.
        ttest_results.append(f"{header} {stats_text}")

        print(f"\n{header}")
        print(f"  {stats_text}")

# 6.2 Chi-square test of independence between depression and family history of
# mental illness. The export below reports `chi2` and `p`; they were never
# computed anywhere in the script, so a fresh run failed with NameError.
contingency_table = pd.crosstab(
    df['Depression'], df['Family History of Mental Illness']
)
chi2, p, _dof, _expected = chi2_contingency(contingency_table)
chi2_summary = f"chi2 = {chi2:.2f}, p = {p:.3f}"
print(f"\nTeste Qui-Quadrado: {chi2_summary}")

# Descriptive statistics of the DataFrame in its final state.
numeric_stats = df.describe()

# Destination of the results workbook. The legacy '.xls' name is kept for
# backward compatibility; the file actually written is the '.xlsx' variant,
# derived once here instead of at every use.
output_xls_path = "/content/drive/MyDrive/Students_depression/analysis_results.xls"
output_xlsx_path = output_xls_path.replace(".xls", ".xlsx")

# One row per analysis: a description and the corresponding result object.
analysis_results = {
    'Description': [
        'Correla√ß√£o com a vari√°vel Depress√£o',
        'Estatisticas descritivas',
        'Teste Qui-Quadrado para Depress√£o e Hist√≥rico Familiar',
        'Relat√≥rio de Classifica√ß√£o do Modelo de Regress√£o Log√≠stica',
        'Distribui√ß√£o da Depress√£o por Cluster',
        'Testes T',
    ],
    'Result': [
        corr_depression.to_dict(),
        numeric_stats.to_dict(),
        chi2_summary,
        classification_rep,
        cluster_depression_dist.to_dict(),
        ttest_results,
    ],
}

# Build the two-column table and write it as an Excel workbook (no index column).
results_df = pd.DataFrame(analysis_results)
results_df.to_excel(output_xlsx_path, index=False)
print(f"\n‚úÖ Resultados da an√°lise salvos em: {output_xlsx_path}")


Output hidden; open in https://colab.research.google.com to view.