In [1]:
!python -m pip install spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Using cached spacy-3.8.2-cp312-cp312-win_amd64.whl.metadata (27 kB)
Using cached spacy-3.8.2-cp312-cp312-win_amd64.whl (11.8 MB)
Installing collected packages: spacy
Successfully installed spacy-3.8.2
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -----------------------

In [1]:
import pandas as pd
import re
import spacy

In [2]:
# !mkdir data
# !wget -O "data/combined_data.csv" "https://drive.google.com/uc?export=download&id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh"

--2024-11-22 14:33:35--  https://drive.google.com/uc?export=download&id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh
Resolving drive.google.com (drive.google.com)... 142.251.211.238, 2404:6800:4009:806::200e
Connecting to drive.google.com (drive.google.com)|142.251.211.238|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh&export=download [following]
--2024-11-22 14:33:37--  https://drive.usercontent.google.com/download?id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.192.33, 2a00:1450:400e:810::2001
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.192.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31469558 (30M) [application/octet-stream]
Saving to: ‘data/combined_data.csv’


2024-11-22 14:34:14 (951 KB/s) - ‘data/combined_data.csv’ save

In [2]:
data_dir = "data/combined_data.csv"
df = pd.read_csv(data_dir, index_col=0)

In [3]:
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53043 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     53043 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


## Basic cleaning

In [5]:
# Check for missing values
print(f"Number of rows with missing values: {df.isnull().sum()}")

# Check for duplicates
print(f"Number of duplicate rows: {df.duplicated(subset=['statement']).sum()}")

Number of rows with missing values: statement    362
status         0
dtype: int64
Number of duplicate rows: 1969


In [6]:
# Drop rows that contain empty values
df = df.dropna() 

# Drop rows that contain duplicate values in the ‘statement’ column and keep only the first row
df = df.drop_duplicates(subset=['statement'], keep='first')

df.reset_index(drop = True, inplace = True)

In [7]:
# Recheck for missing values
print(f"Number of rows with missing values: {df.isnull().sum()}")

# Recheck for duplicates
print(f"Number of duplicate rows: {df.duplicated(subset=['statement']).sum()}")

Number of rows with missing values: statement    0
status       0
dtype: int64
Number of duplicate rows: 0


## Deep cleaning

In [8]:
# Change the data type of ‘statement’ and ‘status’ columns to string
df = df.astype({"statement":str, "status":str})

In [9]:
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [13]:
# Load spaCy model
nlp = spacy.load("/c/Users/user/.virtualenvs/ML-Model-K74sGvU5/Lib/site-packages/en_core_web_sm/en_core_web_sm-3.x.x")

OSError: [E050] Can't find model '/c/Users/user/.virtualenvs/ML-Model-K74sGvU5/Lib/site-packages/en_core_web_sm/en_core_web_sm-3.x.x'. It doesn't seem to be a Python package or a valid path to a data directory.

In [14]:
!which python

'which' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
def clean_text(text):
    text = text.lower()

    # Hapus punctuation marks
    text = re.sub(r'[^\w\s]', '', text)  
    
    # Hapus angka
    text = re.sub(r'\d+', '', text)  
    
    # Hapus emoji (Unicode Range)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Hapus karakter berulang
    text = re.sub(r'(.)\1+', r'\1', text)
    
    # Hapus karakter tunggal (misalnya huruf yang berdiri sendiri)
    text = re.sub(r'\b\w\b', '', text)
    
    # Hapus spasi ganda atau lebih
    text = re.sub(r'\s+', ' ', text)

    doc = nlp(text)
    cleaned_text = []
    for token in doc:
        # remove stopwords
        if token.is_stop:
            continue
        # replace verb with its lemma
        elif token.pos_ == "VERB":
            cleaned_text.append(token.lemma_)
        else:
            cleaned_text.append(token.text)

    return text.strip()

In [26]:
# CLEAN!!!
df['statement'] = df['statement'].apply(clean_text)
df = df[df['statement'] != ""]

## Very deep cleaning

In [27]:
# Data distribution analysis of each label
df.status.value_counts()

status
Normal                  15973
Depression              15086
Suicidal                10639
Anxiety                  3617
Bipolar                  2501
Stress                   2293
Personality disorder      895
Name: count, dtype: int64

In [28]:
# Adding word count column for further analysis
df['word_count'] = df['statement'].apply(lambda x: len(x.split()))

In [29]:
# Define bins and labels for word count ranges
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]  # Adjust as needed
labels = ['1-100', '101-200', '201-300', '301-400', '401-500', '501-600', '601-700', '701-800', '801-900', '901-1000', '+1000']

# Add a column to categorize statements into ranges
df['word_count_range'] = pd.cut(df['word_count'], bins=bins, labels=labels, right=True)

In [49]:
# Count the number of statements in each range
df['word_count_range'].value_counts().sort_index()

word_count_range
1-100       42531
101-200      6121
201-300      1510
301-400       474
401-500       198
501-600        78
601-700        39
701-800        22
801-900        10
901-1000        6
+1000          15
Name: count, dtype: int64

In [50]:
# Group by word count range and label, then count occurrences
df.groupby(['word_count_range', 'status']).size().unstack(fill_value=0)

  df.groupby(['word_count_range', 'status']).size().unstack(fill_value=0)


status,Anxiety,Bipolar,Depression,Normal,Personality disorder,Stress,Suicidal
word_count_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1-100,2705,1677,11106,15970,589,2053,8431
101-200,672,592,2849,3,232,178,1595
201-300,159,162,720,0,47,45,377
301-400,50,41,222,0,17,9,135
401-500,21,21,84,0,7,6,59
501-600,5,4,48,0,2,0,19
601-700,2,1,23,0,0,1,12
701-800,3,2,13,0,0,0,4
801-900,0,0,8,0,0,1,1
901-1000,0,0,4,0,0,0,2


In [55]:
df_export_candidate = df[(df['word_count'] >= 10) & (df['word_count'] <= 500)].reset_index(drop=True)
df_export_candidate.status.value_counts()

status
Depression              14160
Suicidal                 9700
Normal                   5204
Anxiety                  3142
Bipolar                  2464
Stress                   2271
Personality disorder      868
Name: count, dtype: int64

In [56]:
# Count the number of examples for each label
label_counts = df_export_candidate['status'].value_counts()

# Find the label with the minimum count
min_label = label_counts.idxmin()
min_count = label_counts.min()

print(f"Label with the lowest number of examples: {min_label}")
print(f"Number of examples: {min_count}")

Label with the lowest number of examples: Personality disorder
Number of examples: 868


In [57]:
df_export_candidate = df_export_candidate.sort_values(by='word_count', ascending=False)
df_export_candidate = df_export_candidate.groupby('status').head(min_count)
df_export_candidate.reset_index(drop=True, inplace=True)
df_export_candidate

Unnamed: 0,statement,status,word_count,word_count_range
0,live two people family schizophrenia butcher k...,Anxiety,498,401-500
1,anxie tips not really just story anxiety im no...,Anxiety,498,401-500
2,reflecting life lot lately always find going b...,Depression,498,401-500
3,call scapegoat personality father one murdered...,Suicidal,497,401-500
4,since can remember always sad child inside bla...,Suicidal,497,401-500
...,...,...,...,...
6071,ve feeling miserable hopeless weekend hate jus...,Personality disorder,10,1-100
6072,covid view poll https www reddit com poll k kb,Personality disorder,10,1-100
6073,read books view poll https www reddit com poll...,Personality disorder,10,1-100
6074,philosophy interests view poll https www reddi...,Personality disorder,10,1-100


In [58]:
df_export_candidate.status.value_counts()

status
Anxiety                 868
Depression              868
Suicidal                868
Personality disorder    868
Stress                  868
Bipolar                 868
Normal                  868
Name: count, dtype: int64

In [59]:
df_export_candidate.drop(['word_count', 'word_count_range'], axis=1, inplace=True)

In [60]:
df_export_candidate = df_export_candidate.sample(frac=1).reset_index(drop=True)

In [62]:
df_export_candidate.to_csv('data/cleaned_data.csv', index=False)