In [1]:
from google.colab import drive
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
stock_data = '/content/drive/MyDrive/stock_data.csv'

In [4]:
df = pd.read_csv(stock_data)
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [5]:
missing_values = df.isnull().sum()
print(missing_values)

Text         0
Sentiment    0
dtype: int64


In [6]:
class_counts = df['Sentiment'].value_counts()
print(class_counts)

Sentiment
 1    3685
-1    2106
Name: count, dtype: int64


In [7]:
df['Text'][1]

'user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year.  AWESOME.  '

In [8]:
unique_characters = pd.Series(list(''.join(df['Text']))).unique()
print(unique_characters)

['K' 'i' 'c' 'k' 'e' 'r' 's' ' ' 'o' 'n' 'm' 'y' 'w' 'a' 't' 'h' 'l' 'X'
 'I' 'D' 'E' 'T' 'S' 'O' 'Q' 'P' 'N' 'C' 'W' 'B' 'Z' 'A' 'J' 'd' '1' '2'
 ',' 'p' 'v' 'u' ':' 'M' 'V' '.' '5' '%' 'f' 'F' '/' 'G' 'j' "'" 'b' '-'
 'g' '0' '3' '7' '4' 'Y' 'H' '&' 'x' '!' '6' '+' '(' ')' '9' 'q' '~' '8'
 '#' '=' '>' '?' '_' 'z' '[' ']' '*' ';' '^' '<' '|' 'U' 'â' '€' '¦' '™'
 'R' '@' 'L' '$' 'œ' '\x9d' '”' '\n' '"' '˜' 'Â' '\xa0' '\x81' '©' '£' '…'
 '’']


In [9]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\$\%\#@]', '', text)
    text = re.sub(r'\b\d+\b', 'NUM', text)

    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

In [12]:
df['Text'] = df['Text'].apply(preprocess_text)

In [13]:
df['Text'][1]

'user aap movie NUM% return for the feageed indicator just NUM trade for the year awesome'

In [14]:
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [15]:
df.head()

Unnamed: 0,Text,Sentiment
0,kicker on my watchlist xide tit soq pnk cpw bp...,1
1,user aap movie NUM% return for the feageed ind...,1
2,user id be afraid to short amzn they are looki...,1
3,mnta over NUM,1
4,oi over NUM,1


In [16]:
X = df['Text']
y = df['Sentiment']

In [17]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_tfidf = tfidf.fit_transform(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [20]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

target_names = ["negative", "positive"]
report = classification_report(y_test, y_pred, target_names = target_names)

print(f"Accuracy: {accuracy}")

print(report)

Accuracy: 0.7739430543572045
              precision    recall  f1-score   support

    negative       0.79      0.53      0.63       427
    positive       0.77      0.92      0.84       732

    accuracy                           0.77      1159
   macro avg       0.78      0.72      0.73      1159
weighted avg       0.78      0.77      0.76      1159



In [None]:
log_reg_bal = LogisticRegression(max_iter=1000, class_weight = 'balanced')
log_reg_bal.fit(X_train, y_train)

y_pred = log_reg_bal.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

target_names = ["negative", "positive"]
report = classification_report(y_test, y_pred, target_names = target_names)

print(f"Accuracy: {accuracy}")

print(report)

Accuracy: 0.7868852459016393
              precision    recall  f1-score   support

    negative       0.69      0.77      0.73       427
    positive       0.86      0.80      0.83       732

    accuracy                           0.79      1159
   macro avg       0.77      0.78      0.78      1159
weighted avg       0.79      0.79      0.79      1159



In [22]:
xgb_model = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

target_names = ["negative", "positive"]
report = classification_report(y_test, y_pred, target_names=target_names)

print(f"Accuracy: {accuracy}")

print(report)

Accuracy: 0.7635893011216566
              precision    recall  f1-score   support

    negative       0.79      0.49      0.60       427
    positive       0.76      0.92      0.83       732

    accuracy                           0.76      1159
   macro avg       0.77      0.71      0.72      1159
weighted avg       0.77      0.76      0.75      1159



In [None]:
class_weights = {0: len(y) / len(y[y == 0]),
                 1: len(y) / len(y[y == 1])}

sample_weights = np.array([class_weights[i] for i in y_train])

In [None]:
xgb_model_bal = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model_bal.fit(X_train, y_train, sample_weight=sample_weights)

y_pred = xgb_model_bal.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

target_names = ["negative", "positive"]
report = classification_report(y_test, y_pred, target_names=target_names)

print(f"Accuracy: {accuracy}")

print(report)


Accuracy: 0.7627264883520276
              precision    recall  f1-score   support

    negative       0.68      0.66      0.67       427
    positive       0.81      0.82      0.81       732

    accuracy                           0.76      1159
   macro avg       0.75      0.74      0.74      1159
weighted avg       0.76      0.76      0.76      1159

