In [None]:
import os
import xml.etree.ElementTree as et

import numpy as np
import pandas as pd

# Step 1: Locate the dataset on disk.
# The dataset root can be overridden with the DEPRESSION_DATA_DIR environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original hard-coded location is used.
path = os.environ.get(
    'DEPRESSION_DATA_DIR',
    r'C:\Users\User\Machine_Learning_Projects\Depression_Analysis\2017',
)

# Sub-folders of the dataset root for the train and test splits.
train_folder = os.path.join(path, 'train')
test_folder = os.path.join(path, 'test')

# Step 2-4: Read XML files and load data into an appropriate format
def process_xml_files(folder_path, label):
    """Collect the text of every WRITING entry found under ``folder_path``.

    ``folder_path`` is scanned for sub-directories ("chunks"). Every file whose
    name ends in ``.xml`` directly inside a chunk is parsed, and each
    ``WRITING`` element that is a direct child of the XML root yields one
    record.

    Args:
        folder_path: Directory whose immediate sub-directories hold XML files.
            Non-directory entries at this level are ignored.
        label: Value stored under the ``'label'`` key of every record.

    Returns:
        A list of ``{'text': str, 'label': label}`` dicts ordered by chunk
        name, then file name, then document order inside each file. ``text``
        is stripped of surrounding whitespace and is ``''`` when the WRITING
        has no ``TEXT`` child or the ``TEXT`` element is empty.

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is malformed.
        OSError: If ``folder_path`` or a chunk folder cannot be listed or read.
    """
    data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and anything seeded on top of it, such as a train/test split)
    # reproducible across machines and runs.
    for chunk_folder in sorted(os.listdir(folder_path)):
        chunk_folder_path = os.path.join(folder_path, chunk_folder)
        if not os.path.isdir(chunk_folder_path):
            continue

        for xml_file in sorted(os.listdir(chunk_folder_path)):
            if not xml_file.endswith('.xml'):
                continue
            xml_file_path = os.path.join(chunk_folder_path, xml_file)

            # Step 3: Extract data from XML files
            root = et.parse(xml_file_path).getroot()

            # Extract the relevant data from the XML structure
            # Modify this part based on the XML structure of your dataset
            for writing in root.findall('WRITING'):
                # findtext gives None when <TEXT> is absent and '' when it is
                # empty, so neither case can raise AttributeError on .strip().
                text = (writing.findtext('TEXT') or '').strip()

                # Step 4: Load the data into an appropriate format
                data.append({'text': text, 'label': label})

    return data

# Locate the two class folders of the training split
positive_chunks_dir = os.path.join(train_folder, 'positive_examples_anonymous_chunks')
negative_chunks_dir = os.path.join(train_folder, 'negative_examples_anonymous_chunks')

# Read every WRITING of each class into a list of {'text', 'label'} records
train_data_depression = process_xml_files(positive_chunks_dir, label='depression')
train_data_non_depression = process_xml_files(negative_chunks_dir, label='non-depression')

# One DataFrame for the whole training set: depression rows first, then non-depression
train_df = pd.DataFrame([*train_data_depression, *train_data_non_depression])


In [None]:
# Preview the combined training frame: one row per WRITING, columns 'text' and 'label'.
train_df

In [None]:
import gensim

# Replace each raw post (a str) with the token list produced by gensim's
# simple_preprocess. The column is overwritten in place, so rows that are
# already lists (this cell has been run before) are left untouched; without
# this guard a second run would hand lists to simple_preprocess, which expects
# a string.
train_df['text'] = train_df['text'].apply(
    lambda doc: doc if isinstance(doc, list) else gensim.utils.simple_preprocess(doc)
)


In [None]:
# Create the Word2Vec model object without passing a corpus; the vocabulary is
# built and the weights trained by the build_vocab / train calls in the cells
# that follow.
# NOTE(review): window / min_count / workers are passed straight to gensim —
# confirm their exact meaning and the remaining defaults against the docs of
# the installed gensim version.
model = gensim.models.Word2Vec(
    window=5,
    min_count = 2,
    workers=4
)

In [None]:
# Build the model's vocabulary from the tokenised posts held in train_df.text.
model.build_vocab(train_df.text, progress_per=1000)

In [None]:
# Train the embeddings on the same tokenised posts for 5 epochs.
# total_examples is read back from the model (model.corpus_count) —
# presumably populated by the earlier build_vocab call; confirm.
model.train(train_df.text, total_examples=model.corpus_count, epochs=5)

In [None]:
# Persist the trained model to 'exp.model' (path relative to the working directory).
model.save('exp.model')

In [None]:
import gensim

In [None]:
# Rebind `model` to the copy stored in 'exp.model' (the file written by the save cell above).
# NOTE(review): presumably here so later cells can run without retraining — confirm.
model = gensim.models.Word2Vec.load('exp.model')

In [None]:
# Class balance: number of rows per label value (string labels at this point in a top-to-bottom run).
train_df['label'].value_counts()

In [None]:
# Encode the string labels as integers: 'non-depression' -> 0, 'depression' -> 1.
# Values that are not keys of the mapping (for example labels already encoded
# by an earlier run of this cell) pass through unchanged, so re-running the
# cell no longer turns every label into NaN.
label_to_int = {
    'non-depression':0,
    'depression':1
}
train_df['label'] = train_df['label'].map(lambda value: label_to_int.get(value, value))

In [None]:
# Class balance again, now on the integer-encoded labels (0 / 1).
train_df.label.value_counts()

In [None]:
# Inspect the text column (token lists once the tokenisation cell has run).
train_df.text

In [None]:
import numpy as np


def buildvector(text, size):
    """Average the Word2Vec vectors of the tokens in ``text``.

    Looks every token up in the module-level ``model.wv``; tokens that raise
    ``KeyError`` (not in the vocabulary) are skipped.

    Args:
        text: Iterable of tokens for one document.
        size: Dimensionality of the word vectors.

    Returns:
        A float array of shape ``(1, size)`` holding the mean of the vectors
        that were found, or all zeros when no token was found.
    """
    total = np.zeros((1, size))
    hits = 0

    for token in text:
        try:
            token_vec = model.wv[token]
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average
            continue
        total += token_vec.reshape((1, size))
        hits += 1

    # Leave the zero vector untouched when nothing matched (avoids 0/0)
    if hits:
        total /= hits
    return total


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, scale

# One averaged Word2Vec embedding per post; buildvector returns (1, size)
# rows, so the concatenation is an (n_posts, vector_size) matrix.
vectors = np.concatenate([buildvector(x, model.wv.vector_size) for x in train_df.text])

# Split BEFORE standardising. Scaling the full matrix first lets the mean and
# variance of the held-out rows leak into the training features.
X_train, X_test, Y_train, Y_test = train_test_split(vectors, train_df.label, test_size=0.2, random_state=348)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to every split.
split_scaler = StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

# Keep `vectors` standardised as before, now with the train-only statistics.
vectors = split_scaler.transform(vectors)

In [None]:
# Inspect the training feature matrix produced by the split above.
X_train

In [None]:
# Inspect the training labels produced by the split above.
Y_train

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Baseline: logistic regression with default hyper-parameters, fitted on the
# training split as-is (no resampling).
lr_model = LogisticRegression()

# Train the model
lr_model.fit(X_train, Y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Score the held-out predictions: accuracy, recall, precision and F1
accuracy = accuracy_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("Logistic Regression accuracy:", accuracy)
print("Logistic Regression recall score:", recall)
# The label used to read "precison"; only the spelling of the output changed.
print("Logistic Regression precision score:", precision)
print("Logistic Regression f1 score :", f1)


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the feature vectors (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform oversampling using SMOTE (training rows only; the test split is never resampled)
oversampler = SMOTE(random_state=342)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train_scaled, Y_train)

# Perform undersampling using RandomUnderSampler
undersampler = RandomUnderSampler(random_state=342)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_oversampled, y_train_oversampled)

# Train the logistic regression model on the resampled training data.
# The previous version fitted on (X_train, Y_train), which silently discarded
# the scaling / SMOTE / undersampling computed above.
logreg = LogisticRegression()
logreg.fit(X_train_resampled, y_train_resampled)

# Predict on the test set, using the features transformed by the same scaler
# the model was trained behind.
# NOTE(review): the ROC and confusion-matrix cells further down score `logreg`
# on X_test; X_test_scaled is the matching input for this model.
y_pred = logreg.predict(X_test_scaled)

# Evaluate the model
print(classification_report(Y_test, y_pred))

In [None]:
# Evaluate the model with a ROC curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Class probabilities for the test set; column 1 is the second class reported
# by the classifier (label 1 under the 0/1 encoding used in this notebook)
probs = logreg.predict_proba(X_test)
probs_positive = probs[:, 1]

# ROC operating points and the area under the resulting curve
fpr, tpr, thresholds = roc_curve(Y_test, probs_positive)
roc_auc = auc(fpr, tpr)

# Draw on the current axes: model curve plus the chance diagonal
ax = plt.gca()
ax.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Evaluate the performance with a confusion matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Hard predictions on the test set and their confusion matrix
Y_pred = logreg.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)

# Heatmap of the matrix; tick labels are the distinct values present in Y_test
labels = np.unique(Y_test)
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=labels,
    yticklabels=labels,
)
heatmap_ax.set_xlabel('Predicted labels')
heatmap_ax.set_ylabel('True labels')
heatmap_ax.set_title('Confusion Matrix Plot')

plt.show()

In [None]:
## UNDER CONSTRUCTION ##
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidfmodel = TfidfVectorizer()
# inputs = tfidfmodel.fit_transform(df['text_without_stopwords'])
# print(inputs)
# tfidfmodel.vocabulary_
# from sklearn.model_selection import train_test_split
# X_train, X_test, Y_train, Y_test = train_test_split(inputs, df.label, test_size=0.2, random_state=48)
# X_train.shape
# Y_train.shape
# from sklearn.linear_model import LogisticRegression
# lr_model = LogisticRegression()
# lr_model.fit(X_train, Y_train)
# from sklearn.metrics import accuracy_score, recall_score
# predictions = lr_model.predict(X_test)
# accuracy = accuracy_score(Y_test, predictions)
# recall = recall_score(Y_test, predictions)
# print(accuracy)
# print(recall)

In [None]:
# # Converting text data to Word2Vec embeddings
# X_vectors = []
# for items in df.lemmatized_text:
#     vectors = [word2vec_model.wv[word] for word in items if word in word2vec_model.wv]
#     if vectors:
#         X_vectors.append(np.mean(vectors, axis=0)) # This will result in a single vector representation for the whole sentence. Each sentence will now be represented as a vector of size 100. (vector size =100 by default)
# #Splitting the dataset
# X_array = np.array(X_vectors)
# X_train, X_test, Y_train, Y_test = train_test_split(X_array, df.label, test_size=0.2, random_state=48)
# X_train.shape
# Y_train.shape
# X_test.shape
# Y_test.shape
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
#
# # Create a Logistic Regression model
# lr_model = LogisticRegression()
#
# # Train the model
# lr_model.fit(X_train, Y_train)
#
# # Make predictions on the test set
# y_pred = lr_model.predict(X_test)
#
# # Calculate accuracy
# accuracy = accuracy_score(Y_test, y_pred)
# recall = recall_score(Y_test, y_pred)
# precision = precision_score(Y_test, y_pred)
# f1 = f1_score(Y_test, y_pred)
# print("Logistic Regression accuracy:", accuracy)
# print("Logistic Regression recall score:", recall)
# print("Logistic Regression precison score:", precision)
# print("Logistic Regression f1 score :", f1)