In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [36]:
df=pd.read_csv('/content/sentiment_tweets3.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Index                      10314 non-null  int64 
 1   message to examine         10314 non-null  object
 2   label (depression result)  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB


In [37]:
df.drop(columns="Index", inplace=True)

In [38]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
port_stem=PorterStemmer()
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove mentions and hashtags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    # Use the instance of PorterStemmer to call stem()
    text = ' '.join([port_stem.stem(word) for word in text.split() if word not in stopwords.words('english')])
    return text

In [55]:
df['message to examine']=df['message to examine'].apply(preprocess_text)

In [56]:
X=df['message to examine'].values
y=df["label (depression result)"].values

In [57]:
vectorizer=TfidfVectorizer()
X = vectorizer.fit_transform(df["message to examine"])

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [60]:
models=[LogisticRegression(max_iter=10000), SVC(), DecisionTreeClassifier(),RandomForestClassifier(), KNeighborsClassifier(n_neighbors = 1)]

In [61]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [62]:
def check_default_parameter():
    for model in models:
        # Fit the model using X_train and y_train (pandas DataFrames or Series)
        clf=model.fit(X_train, y_train)
        cross_score=cross_val_score(clf, X,y, cv=3)
        x_predict = model.predict(X_train)
        score = accuracy_score(y_train, x_predict)
        print(model)
        print(cross_score.mean())
        print(score)
        print("--------------")

In [63]:
check_default_parameter()

LogisticRegression(max_iter=10000)
0.9739189451231335
0.9860622954793359
--------------
SVC()
0.9862323056040334
0.9973336565264816
--------------
DecisionTreeClassifier()
0.9917587744812876
0.9993940128469276
--------------
RandomForestClassifier()
0.9914679076982741
0.9993940128469276
--------------
KNeighborsClassifier(n_neighbors=1)
0.8024044987395773
0.9960004847897225
--------------


In [74]:
model=SVC().fit(X_train,y_train)

In [99]:
input_string = "I feel like a failure."
input_string=preprocess_text(input_string)
input_transformed = vectorizer.transform([input_string])
prediction = model.predict(input_transformed)
print(prediction)
if prediction[0]==0:
    print("Not a depression")
else:
    print("Depression")

[0]
Not a depression
