In [1]:
from google.colab import drive
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
finance_data = '/content/drive/MyDrive/finance_data.csv'

In [4]:
df = pd.read_csv(finance_data)
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [5]:
missing_values = df.isnull().sum()
print(missing_values)

Sentence     0
Sentiment    0
dtype: int64


In [6]:
class_counts = df['Sentiment'].value_counts()
print(class_counts)

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64


In [7]:
df['Sentence'][0]

"The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model ."

In [8]:
unique_characters = pd.Series(list(''.join(df['Sentence']))).unique()
print(unique_characters)

['T' 'h' 'e' ' ' 'G' 'o' 'S' 'l' 'u' 't' 'i' 'n' 's' 'c' 'g' 'y' 'w' 'v'
 'r' 'a' 'B' 'f' "'" 'P' 'b' 'p' 'd' 'L' ',' 'C' 'm' '.' '$' 'E' 'I' '1'
 '5' '0' '2' 'K' 'F' 'q' 'U' 'R' '3' '7' '6' 'z' '-' 'x' 'A' 'j' '4' 'k'
 'Y' 'D' 'M' 'H' 'O' 'N' 'X' '8' ':' '%' 'Q' '#' '?' '/' '9' 'V' '+' 'ñ'
 'J' '`' 'W' '@' '&' '(' ')' 'Z' 'ú' '!' '>' 'ó' 'Â' '£' '"' ';' 'à' '®'
 '¦' '=' 'ä' 'â' '€' '“' 'Ã' '¶' 'Ñ' '_' '📈' 'é' '…' '”' '^' 'á' '«' '|'
 '[' ']' '~' '{' '}' '¼' '¬' 'í' '<' 'Á' '’']


In [9]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\$\%\#@]', '', text)
    text = re.sub(r'\b\d+\b', 'NUM', text)

    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

In [12]:
df['Sentence'] = df['Sentence'].apply(preprocess_text)

In [13]:
df['Sentence'][0]

'the geosolutions technology will leverage benefon s gps solution by providing location based search technology a community platform location relevant multimedia content and a new and powerful commercial model'

In [14]:
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [15]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,2
1,$esi on low down $NUM to $NUM bk a real possib...,0
2,for the last quarter of NUM componenta s net s...,2
3,according to the finnishrussian chamber of com...,1
4,the swedish buyout firm ha sold it remaining N...,1


In [16]:
X = df['Sentence']
y = df['Sentiment']

In [17]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_tfidf = tfidf.fit_transform(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [19]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names = le.classes_)

print(f"Accuracy: {accuracy}")

print(report)

Accuracy: 0.716852010265184
              precision    recall  f1-score   support

    negative       0.41      0.14      0.21       175
     neutral       0.72      0.90      0.80       622
    positive       0.77      0.68      0.72       372

    accuracy                           0.72      1169
   macro avg       0.63      0.57      0.58      1169
weighted avg       0.69      0.72      0.69      1169



In [20]:
log_reg_bal = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg_bal.fit(X_train, y_train)

y_pred = log_reg_bal.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names = le.classes_)

print(f"Accuracy: {accuracy}")

print(report)

Accuracy: 0.72027373823781
              precision    recall  f1-score   support

    negative       0.47      0.61      0.53       175
     neutral       0.81      0.75      0.78       622
    positive       0.74      0.73      0.73       372

    accuracy                           0.72      1169
   macro avg       0.67      0.70      0.68      1169
weighted avg       0.74      0.72      0.73      1169



In [21]:
xgb_model = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"Accuracy: {accuracy}")

print(report)


Accuracy: 0.6834901625320787
              precision    recall  f1-score   support

    negative       0.32      0.13      0.19       175
     neutral       0.69      0.86      0.76       622
    positive       0.75      0.65      0.70       372

    accuracy                           0.68      1169
   macro avg       0.59      0.55      0.55      1169
weighted avg       0.65      0.68      0.66      1169



In [None]:
class_weights = {0: len(y) / len(y[y == 0]),
                 1: len(y) / len(y[y == 1]),
                 2: len(y) / len(y[y == 2])}

sample_weights = np.array([class_weights[i] for i in y_train])

In [None]:
xgb_model_bal = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model_bal.fit(X_train, y_train, sample_weight=sample_weights)

y_pred = xgb_model_bal.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"Accuracy: {accuracy}")

print(report)


Accuracy: 0.6817792985457656
              precision    recall  f1-score   support

    negative       0.40      0.45      0.42       175
     neutral       0.76      0.75      0.75       622
    positive       0.70      0.68      0.69       372

    accuracy                           0.68      1169
   macro avg       0.62      0.63      0.62      1169
weighted avg       0.69      0.68      0.68      1169

