In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def plot_log_odds(results_df, title, filename):
    """
    Generates and saves a plot of log odds ratios with 95% confidence intervals.

    Each row of ``results_df`` is drawn as one point (the coefficient) with a
    horizontal error bar spanning ``conf_lower`` to ``conf_upper``. A dashed
    vertical line at 0 marks "no effect" on the log-odds scale.

    Args:
        results_df (pd.DataFrame): DataFrame with columns 'param', 'coef', 'conf_lower', 'conf_upper'.
        title (str): The title for the plot.
        filename (str): The path to save the plot image.

    Raises:
        KeyError: If ``results_df`` lacks any of the required columns.
    """
    # Validate before touching matplotlib so a malformed frame cannot leave a
    # half-built figure behind.
    required_columns = ('param', 'coef', 'conf_lower', 'conf_upper')
    missing = [col for col in required_columns if col not in results_df.columns]
    if missing:
        raise KeyError(f"results_df is missing required column(s): {missing}")

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(10, 6))

    try:
        # errorbar expects distances from the point, not absolute bounds, so
        # convert the interval limits into (left, right) offsets.
        lower_err = results_df['coef'] - results_df['conf_lower']
        upper_err = results_df['conf_upper'] - results_df['coef']

        # Create the error bar plot
        ax.errorbar(
            x=results_df['coef'],
            y=results_df['param'],
            xerr=[lower_err, upper_err],
            fmt='o',
            color='darkblue',
            ecolor='skyblue',
            elinewidth=3,
            capsize=5,
            markersize=8
        )

        # Add a vertical line at 0 for reference
        ax.axvline(x=0, linestyle='--', color='grey', linewidth=1)

        # Set labels and title
        ax.set_title(title, fontsize=16, pad=20)
        ax.set_xlabel('Log Odds Ratio (Coefficient)', fontsize=12)
        ax.set_ylabel('Personality Trait', fontsize=12)

        # Invert y-axis to have a more intuitive top-to-bottom reading
        ax.invert_yaxis()

        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise every failed call leaves an open figure in the kernel.
        plt.close(fig)

    print(f"Plot saved to: {filename}")


In [5]:
print("--- Starting Logistic Regression Analysis ---")

# --- Configuration ---
# All paths are derived from BASE_PATH so the dataset location is changed in one place.
BASE_PATH = "/data2/julina/scripts/tweets/2020/03/"
ANALYSIS_DIR = os.path.join(BASE_PATH, "SU_and_NON_SU_analysis/")
INPUT_FILE = os.path.join(ANALYSIS_DIR, "all_users_classified_with_personality.csv")

# Predictor columns and the binary outcome column used by the regressions below.
PERSONALITY_TRAITS = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']
TARGET_VARIABLE = 'DrugAbuse'

# Dictionary to map short trait names to full names for plotting
trait_name_map = {
    'cOPN': 'Openness',
    'cCON': 'Conscientiousness',
    'cEXT': 'Extraversion',
    'cAGR': 'Agreeableness',
    'cNEU': 'Neuroticism'
}

# --- Step 1: Load Data ---
print(f"\nLoading data from {INPUT_FILE}...")
if not os.path.isfile(INPUT_FILE):
    # Stop here instead of only printing: later cells read INPUT_FILE
    # unconditionally, so continuing would just fail further away from the
    # real cause.
    raise FileNotFoundError(f"FATAL ERROR: Input file not found at {INPUT_FILE}.")

--- Starting Logistic Regression Analysis ---

Loading data from /data2/julina/scripts/tweets/2020/03/SU_and_NON_SU_analysis/all_users_classified_with_personality.csv...


In [6]:
# Read the dataset, keeping only rows where every trait score and the label are present.
df = pd.read_csv(INPUT_FILE).dropna(subset=[*PERSONALITY_TRAITS, TARGET_VARIABLE])
print(f"Data shape after dropping NA: {df.shape}")

# --- Step 2: Balance the Dataset ---
# Split by label: 1 = substance-use (SU) users, 0 = non-SU users.
df_su = df.loc[df[TARGET_VARIABLE] == 1]
df_non_su = df.loc[df[TARGET_VARIABLE] == 0]

print("\nOriginal class distribution:")
print(f"  SU Users (1): {df_su.shape[0]}")
print(f"  NON-SU Users (0): {df_non_su.shape[0]}")

# The larger class is treated as the majority; on a tie the non-SU class is.
su_is_majority = df_su.shape[0] > df_non_su.shape[0]
majority_df = df_su if su_is_majority else df_non_su
minority_df = df_non_su if su_is_majority else df_su

# Randomly downsample the majority (without replacement, fixed seed) to the
# minority's size, then stack the two equally sized groups.
majority_downsampled = resample(
    majority_df,
    replace=False,
    n_samples=minority_df.shape[0],
    random_state=42,
)
df_balanced = pd.concat([minority_df, majority_downsampled])

print("\nBalanced class distribution for analysis:")
print(df_balanced[TARGET_VARIABLE].value_counts())

Data shape after dropping NA: (407370, 16)

Original class distribution:
  SU Users (1): 340683
  NON-SU Users (0): 66687

Balanced class distribution for analysis:
DrugAbuse
0    66687
1    66687
Name: count, dtype: int64


In [7]:
# --- Step 3: Define Variables ---
# Outcome vector and design matrix (trait columns plus an added constant column).
y = df_balanced[TARGET_VARIABLE]
X_multi = sm.add_constant(df_balanced[PERSONALITY_TRAITS])

# --- Step 4: Multiple Logistic Regression ---
print(
    "\n\n" + "=" * 60,
    "  Multiple Logistic Regression: All Traits vs. Substance Use",
    "=" * 60,
    sep="\n",
)

# Starts empty so later cells can tell "model failed" from "model fitted".
results_for_plotting_multi = pd.DataFrame()
try:
    logit_model_all = sm.Logit(y, X_multi).fit()
    print(logit_model_all.summary())

    # The constant term is not a trait, so leave it out of the plot table.
    params = logit_model_all.params.drop('const')
    conf = logit_model_all.conf_int().drop('const')

    # One row per trait: display name, coefficient, and interval bounds.
    results_for_plotting_multi = pd.DataFrame({
        "param": pd.Series(params.index).map(trait_name_map),
        "coef": params.to_numpy(),
        "conf_lower": conf.iloc[:, 0].to_numpy(),
        "conf_upper": conf.iloc[:, 1].to_numpy(),
    })
except Exception as err:
    print(f"Could not run multiple logistic regression. Error: {err}")

# --- Step 5: Individual Logistic Regressions ---
   



  Multiple Logistic Regression: All Traits vs. Substance Use
Optimization terminated successfully.
         Current function value: 0.680091
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:              DrugAbuse   No. Observations:               133374
Model:                          Logit   Df Residuals:                   133368
Method:                           MLE   Df Model:                            5
Date:                Thu, 12 Jun 2025   Pseudo R-squ.:                 0.01884
Time:                        14:19:52   Log-Likelihood:                -90706.
converged:                       True   LL-Null:                       -92448.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.6461      0.047    -13.760  

In [10]:
print(
    "\n\n" + "=" * 60,
    "  Individual Logistic Regressions: Each Trait vs. Substance Use",
    "=" * 60,
    sep="\n",
)

# Fit one single-predictor model per trait; a failure for one trait is
# reported and does not stop the remaining fits.
individual_results_list = []
for trait in PERSONALITY_TRAITS:
    try:
        X_single = sm.add_constant(df_balanced[trait])
        logit_model_single = sm.Logit(y, X_single).fit(disp=0)

        # Position 0 is the constant; position 1 is the trait itself.
        coef = logit_model_single.params.iloc[1]
        conf_lower, conf_upper = logit_model_single.conf_int().iloc[1]

        individual_results_list.append(dict(
            param=trait,  # short name here; mapped to the full name below
            coef=coef,
            conf_lower=conf_lower,
            conf_upper=conf_upper,
        ))
    except Exception as err:
        print(f"Could not run regression for {trait}. Error: {err}")

results_for_plotting_individual = pd.DataFrame(individual_results_list)
# Swap the short column codes for readable trait names (only when any fit succeeded).
if individual_results_list:
    results_for_plotting_individual = results_for_plotting_individual.assign(
        param=results_for_plotting_individual['param'].map(trait_name_map)
    )

# --- Step 6: Generate Plots ---
print(
    "\n\n" + "=" * 60,
    "  Generating and Saving Log Odds Ratio Plots",
    "=" * 60,
    sep="\n",
)

    



  Individual Logistic Regressions: Each Trait vs. Substance Use


  Generating and Saving Log Odds Ratio Plots


In [13]:
# Show the multiple-regression table (param, coef, conf_lower, conf_upper) for a quick check before plotting.
results_for_plotting_multi

Unnamed: 0,param,coef,conf_lower,conf_upper
0,Openness,-0.032763,-0.112188,0.046662
1,Conscientiousness,-0.091012,-0.183974,0.001951
2,Extraversion,1.170767,1.092286,1.249249
3,Agreeableness,1.397365,1.311393,1.483337
4,Neuroticism,-1.254253,-1.328036,-1.180469


In [16]:
# The two plots are independent: a failed multiple regression must not
# suppress the individual-regression plot (or the completion message).

# Plot for Multiple Regression Model
if not results_for_plotting_multi.empty:
    print("\nData being sent to the MULTIPLE regression plot:")
    print(results_for_plotting_multi)
    plot_log_odds(
        results_df=results_for_plotting_multi,
        title='Log Odds Ratios of Personality Traits on Substance Use\n(Multiple Regression Model)',
        # NOTE(review): the trailing underscore in this name looks unintentional;
        # kept so the output path does not change.
        filename='multi_regression_log_odds_.png'
    )
else:
    print("\nSkipping MULTIPLE regression plot: no results available.")

# Plot for Individual Regression Models
if not results_for_plotting_individual.empty:
    print("\nData being sent to the INDIVIDUAL regression plot:")
    print(results_for_plotting_individual)
    plot_log_odds(
        results_df=results_for_plotting_individual,
        title='Log Odds Ratios of Personality Traits on Substance Use\n(Individual Regression Models)',
        filename='individual_regression_log_odds.png'
    )
else:
    print("\nSkipping INDIVIDUAL regression plot: no results available.")

print("\n\n--- Analysis Complete ---")


Data being sent to the MULTIPLE regression plot:
               param      coef  conf_lower  conf_upper
0           Openness -0.032763   -0.112188    0.046662
1  Conscientiousness -0.091012   -0.183974    0.001951
2       Extraversion  1.170767    1.092286    1.249249
3      Agreeableness  1.397365    1.311393    1.483337
4        Neuroticism -1.254253   -1.328036   -1.180469
Plot saved to: multi_regression_log_odds_.png

Data being sent to the INDIVIDUAL regression plot:
               param      coef  conf_lower  conf_upper
0           Openness  0.242357    0.170658    0.314057
1  Conscientiousness -0.193757   -0.278133   -0.109382
2       Extraversion  1.270955    1.193902    1.348008
3      Agreeableness  1.540798    1.458279    1.623317
4        Neuroticism -1.403068   -1.475698   -1.330438
Plot saved to: individual_regression_log_odds.png


--- Analysis Complete ---


In [39]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

def generate_violin_plots():
    """
    Generate and save one combined figure of violin plots showing the
    distribution of each personality trait, split by the target group.

    Steps:
    1. Loads the dataset and drops rows missing any trait score or the target.
    2. Creates a single figure with one subplot per trait, laid out in a row.
    3. Draws a violin plot (with quartile lines) per trait on its subplot.
    4. Saves the combined figure as a 300-dpi PNG in the working directory.

    Returns:
        None. Progress and failures are reported via ``print``. If the input
        file is missing the function returns early without plotting.
    """
    print("--- Starting Violin Plot Generation ---")

    # --- Configuration ---
    BASE_PATH = "/data2/julina/scripts/tweets/2020/03/"
    ANALYSIS_DIR = os.path.join(BASE_PATH, "SU_and_NON_SU_analysis/")
    INPUT_FILE = os.path.join(ANALYSIS_DIR, "all_users_classified_with_personality.csv")

    # Check if input file exists
    if not os.path.exists(INPUT_FILE):
        print(f"FATAL ERROR: Input file not found at {INPUT_FILE}.")
        return

    # The personality traits (independent variables): column name -> display name
    PERSONALITY_TRAITS = {
        'cOPN': 'Openness',
        'cCON': 'Conscientiousness',
        'cEXT': 'Extraversion',
        'cAGR': 'Agreeableness',
        'cNEU': 'Neuroticism'
    }
    # The outcome variable (dependent variable)
    TARGET_VARIABLE = 'DrugAbuse'
    # Display label for each target value; unknown values fall back to str(value).
    GROUP_LABELS = {0: 'Non-Substance User', 1: 'Substance User'}

    # --- Step 1: Load Data ---
    print(f"\nLoading data from {INPUT_FILE}...")
    df = pd.read_csv(INPUT_FILE)
    df.dropna(subset=list(PERSONALITY_TRAITS.keys()) + [TARGET_VARIABLE], inplace=True)
    print(f"Successfully loaded and cleaned {len(df)} rows.")

    # --- Step 2: Generate and Save Combined Plot ---
    print("\nGenerating a combined plot for all personality traits...")

    fig = None
    try:
        plt.style.use('seaborn-v0_8-whitegrid')

        # Take the category order from the data itself and pass it to seaborn
        # explicitly, so tick labels always describe the violin drawn at that
        # position instead of assuming exactly two groups in 0/1 order.
        groups = sorted(df[TARGET_VARIABLE].unique().tolist())
        tick_labels = [GROUP_LABELS.get(group, str(group)) for group in groups]

        # One column per trait; squeeze=False keeps `axes` 2-D even for one trait.
        n_traits = len(PERSONALITY_TRAITS)
        fig, axes = plt.subplots(
            nrows=1, ncols=n_traits, figsize=(5 * n_traits, 6), squeeze=False
        )

        # Iterate through each trait and its corresponding subplot axis
        for ax, (short_name, full_name) in zip(axes[0], PERSONALITY_TRAITS.items()):
            # Create the violin plot on the current subplot
            sns.violinplot(
                x=TARGET_VARIABLE,
                y=short_name,
                data=df,
                order=groups,
                ax=ax,
                palette="muted",
                split=False,  # Set to False for separate violins
                inner="quartile"  # Shows the quartiles inside the violin
            )

            # Customize plot labels and title for each subplot
            ax.set_title(f'Distribution of {full_name}', fontsize=14, pad=10)
            ax.set_xlabel('Group', fontsize=12)
            ax.set_ylabel(f'{full_name} Score', fontsize=12)
            # Fix tick positions before relabelling; set_xticklabels on its own
            # relies on whatever locator happens to be active.
            ax.set_xticks(list(range(len(groups))))
            ax.set_xticklabels(tick_labels)

        # Adjust layout to prevent titles/labels from overlapping
        fig.tight_layout()

        # Save the combined figure
        output_filename = 'violin_plots_combined_horizontal.png'
        fig.savefig(output_filename, dpi=300)

        print(f"\nSuccessfully saved combined plot to: {output_filename}")

    except Exception as e:
        # Best-effort: report the failure instead of aborting the notebook run.
        print(f"\nFAILED to generate combined plot. Error: {e}")
    finally:
        # Release the figure on both success and failure to free memory.
        if fig is not None:
            plt.close(fig)

    print("\n--- Plot generation complete. ---")


# Entry point: generate the violin plots when this code is executed as the
# main module rather than imported.
if __name__ == '__main__':
    generate_violin_plots()


--- Starting Violin Plot Generation ---

Loading data from /data2/julina/scripts/tweets/2020/03/SU_and_NON_SU_analysis/all_users_classified_with_personality.csv...
Successfully loaded and cleaned 407370 rows.

Generating a combined plot for all personality traits...

Successfully saved combined plot to: violin_plots_combined_horizontal.png

--- Plot generation complete. ---
