In [29]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [30]:
# Load the datasets
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
except FileNotFoundError:
    print("Make sure 'train.csv' and 'test.csv' are in the same directory.")
    exit()


print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("\nFirst 5 rows of training data:")
print(train_df.head())

Training data shape: (37130, 3)
Test data shape: (15913, 2)

First 5 rows of training data:
      id                                          statement      status
0  47647  life doesn’t feel worth it that’s kind of it? ...  Depression
1  21689  This life sucks and if it were for my religiou...  Depression
2  28246  its been 9 months now for our marriage and she...      Stress
3  22363  I do not feel particularly sad or anxious or a...  Depression
4  13362  I am taking Venlafaxine. it is an SSRI. does n...  Depression


In [31]:
train_df['statement'].fillna('', inplace=True)
test_df['statement'].fillna('', inplace=True)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df['cleaned_statement'] = train_df['statement'].apply(clean_text)
test_df['cleaned_statement'] = test_df['statement'].apply(clean_text)

print("\nExample of cleaned text:")
print("Original:", train_df['statement'].iloc[5])
print("Cleaned: ", train_df['cleaned_statement'].iloc[5])

label_encoder = LabelEncoder()
train_df['status_encoded'] = label_encoder.fit_transform(train_df['status'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['statement'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['statement'].fillna('', inplace=True)



Example of cleaned text:
Original: I (17M) have come to the realisation that My dad is a prick. Ever since i was little he never really cared about me, he saw me as more of an extension of himself. it is always been about getting a good carreer and getting an education. My granddad is the same. Until age 12 I thought that taking an extra education after school was a law. I legit thought that I had to get into a Gymnasium or something like wise, by law.When I was about 6 years old, I could not sleep one night. So as a normal kid, I went into My parents room and said: "I cannot sleep" My mom was ready to tuck me in, but My dad said: "You say that all the time, and you always fall asleep anyway." To this day that memory is still crystal clear in my mind. My granddad calls me fat everytime we visit him, and always ask if I know what gymnasium I am going to next year, or what job I want in the future.Time and time again my dad's side of the family has showed to me that they cannot be trust

In [32]:
tfidf_vectorizer = TfidfVectorizer(max_features=20000, stop_words='english')

X_train = tfidf_vectorizer.fit_transform(train_df['cleaned_statement'])

X_test = tfidf_vectorizer.transform(test_df['cleaned_statement'])

y_train = train_df['status_encoded']

print("\nShape of TF-IDF matrices:")
print("Training features:", X_train.shape)
print("Test features:", X_test.shape)


Shape of TF-IDF matrices:
Training features: (37130, 20000)
Test features: (15913, 20000)


In [33]:
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print("--- Model Evaluation ---")
print(f"New training set size: {X_train_split.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

--- Model Evaluation ---
New training set size: 29704
Validation set size: 7426


In [34]:
print("\nTraining the model...")
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_split, y_train_split)
print("Model training complete! ✅")

print("Making predictions on the validation set...")
y_pred_val = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")


Training the model...
Model training complete! ✅
Making predictions on the validation set...
Validation Accuracy: 0.7482


In [35]:
print("\nTraining the model...")
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
print("Model training complete! ✅")


Training the model...
Model training complete! ✅


In [36]:
print("\nMaking predictions on the test set...")
predictions_encoded = model.predict(X_test)

predictions = label_encoder.inverse_transform(predictions_encoded)

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'status': predictions
})

submission_df.to_csv('../data/submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully! 🎉")
print("\nFirst 5 rows of submission file:")
print(submission_df.head())


Making predictions on the test set...

Submission file 'submission.csv' created successfully! 🎉

First 5 rows of submission file:
      id      status
0  10203    Suicidal
1  51476      Stress
2  31460      Normal
3  33980  Depression
4   4200      Normal
