In [None]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Preprocessing

In [None]:
# Directory (relative path) that holds the per-person feature files
data_path = 'data_face'

# Collect every entry in the data directory. glob gives no ordering
# guarantee (it depends on the file system), so the paths are sorted to keep
# the person order -- and therefore the seeded train/test split further down --
# reproducible across machines and runs.
all_files = sorted(glob.glob(os.path.join(data_path, '*')))

def load_text_file(file_path):
    """Load a whitespace-delimited text file into a DataFrame.

    Columns are separated by any run of whitespace; pandas' default header
    inference is used.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # `delim_whitespace=True` is deprecated since pandas 2.2; the regex
    # separator below is the documented equivalent.
    return pd.read_csv(file_path, sep=r'\s+')

def load_hog_file(file_path):
    """Read a raw binary file as a flat array of 32-bit floats.

    Parameters
    ----------
    file_path : str or path-like
        Location of the binary file.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array holding the whole file content.
    """
    # np.fromfile opens the path itself, so no explicit file handle is needed
    return np.fromfile(file_path, dtype=np.float32)

# Per-person store: person id -> {file name -> loaded contents}
data = {}

# Walk every discovered path and load it according to its extension
for file_path in all_files:
    file_name = os.path.basename(file_path)
    # The person id is whatever precedes the first underscore in the file name
    person_id = file_name.partition('_')[0]

    # An entry is created for every path, even when nothing is loaded below
    person_files = data.setdefault(person_id, {})

    if file_name.endswith('.txt'):
        person_files[file_name] = load_text_file(file_path)
    elif file_name.endswith('.bin'):
        person_files[file_name] = load_hog_file(file_path)

def preprocess_text_data(df):
    """Mean-impute missing values, then standardize the result.

    Both transformers are fitted on ``df`` itself, so the statistics used are
    those of this single table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean; it is handed directly to scikit-learn's
        ``SimpleImputer``.

    Returns
    -------
    The output of ``StandardScaler.fit_transform`` applied to the imputed
    values.
    """
    # Step 1: fill gaps with the per-column mean
    filled = SimpleImputer(strategy='mean').fit_transform(df)
    # Step 2: standardize the imputed values
    return StandardScaler().fit_transform(filled)

# Swap every loaded table for its imputed and standardized version;
# entries that are not DataFrames are left untouched
for person_data in data.values():
    for file_name, df in person_data.items():
        if not isinstance(df, pd.DataFrame):
            continue
        # Re-assigning an existing key is safe while iterating over the dict
        person_data[file_name] = preprocess_text_data(df)

# Show which files are available for one example person
person_id = '301'
print(data[person_id].keys())


# Exploratory data analysis

In [None]:
def plot_feature_distribution(data, feature_name):
    """Plot a histogram (with KDE) of one feature column pooled over all people.

    Parameters
    ----------
    data : dict
        Mapping of person id -> {file name -> loaded object}. Only 2-D NumPy
        arrays are used; every other value (including 1-D arrays, which
        cannot be indexed by column) is skipped.
    feature_name : int
        Index of the column to plot. It is also used verbatim in the plot
        title and x-axis label.

    Raises
    ------
    ValueError
        If ``data`` contains no 2-D array to take the column from.
    """
    # Column indexing below needs two dimensions, so 1-D arrays are filtered
    # out here instead of failing with an IndexError.
    feature_values = [
        table[:, feature_name]
        for person_data in data.values()
        for table in person_data.values()
        if isinstance(table, np.ndarray) and table.ndim == 2
    ]

    if not feature_values:
        raise ValueError('no 2-D feature arrays available to plot')

    feature_values = np.concatenate(feature_values)
    sns.histplot(feature_values, kde=True)
    plt.title(f'Distribution of {feature_name}')
    plt.xlabel(feature_name)
    plt.ylabel('Frequency')
    plt.show()

# Histogram of the first column (index 0); pass another index to inspect a different feature
plot_feature_distribution(data, feature_name=0)


# Feature extraction and train/test split

In [None]:
def extract_features(person_data):
    """Build one flat feature vector from all array data of a person.

    Parameters
    ----------
    person_data : dict
        Mapping of file name -> loaded object. Every NumPy array, whatever
        its number of dimensions, is flattened and used; values of any other
        type are ignored.

    Returns
    -------
    numpy.ndarray
        1-D array: the flattened arrays concatenated in sorted file-name order.

    Raises
    ------
    ValueError
        If ``person_data`` contains no NumPy array.
    """
    # Sorted file-name order keeps the layout of the vector independent of
    # the order in which the files happened to be loaded.
    # np.ravel leaves 1-D arrays unchanged and flattens 2-D tables, so inputs
    # of mixed dimensionality can be concatenated.
    features = [
        np.ravel(person_data[file_name])
        for file_name in sorted(person_data)
        if isinstance(person_data[file_name], np.ndarray)
    ]

    if not features:
        raise ValueError('no array data available to build a feature vector')

    return np.concatenate(features)

# One feature vector and one label per person, collected in parallel lists
feature_vectors, labels = [], []

for person_id, person_data in data.items():
    features = extract_features(person_data)
    feature_vectors += [features]
    # NOTE(review): placeholder -- every person currently gets label 0;
    # replace with the actual label before training
    labels += [0]

# Stack into the arrays scikit-learn expects
X, y = np.array(feature_vectors), np.array(labels)

# Hold out 20% of the people for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# Creating the pipeline & hyperparameter tuning

In [None]:
# Two-step model: standardize the features, then fit a logistic regression
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
])

# Search space; the `classifier__` prefix routes each setting to the
# pipeline step of that name
param_grid = dict(
    classifier__C=[0.1, 1, 10],
    classifier__penalty=['l1', 'l2'],
    classifier__solver=['liblinear'],
)

# Exhaustive 5-fold cross-validated search, ranked by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training portion only
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')


# Evaluation

In [None]:
# Label the held-out samples with the best estimator found by the grid search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Headline accuracy followed by the per-class report
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap (integer cell counts)
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()
