In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import spacy

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Load your dataset with 'text' and 'sentiment' columns
df = pd.read_csv('all-data.csv',encoding='latin-1')

df.head(5)


Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [3]:
df.shape


(4845, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 2 columns):
 #   Column                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                           --------------  ----- 
 0   neutral                                                                                                                          4845 non-null   object
 1   According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .  4845 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [5]:
column_names =['sentiment','text']
df = pd.read_csv('all-data.csv', encoding='latin-1',names=column_names)
df.head(5)
     

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [6]:

df.describe()

Unnamed: 0,sentiment,text
count,4846,4846
unique,3,4838
top,neutral,TELECOMWORLDWIRE-7 April 2006-TJ Group Plc sel...
freq,2879,2


In [8]:
df.isnull().sum()


sentiment    0
text         0
dtype: int64

In [9]:
df['sentiment'].value_counts()


sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [10]:

import pandas as pd
from sklearn.utils import resample

majority_class = 'neutral'
minority_classes = ['positive', 'negative']

majority_df = df[df['sentiment'] == majority_class]
minority_df = df[df['sentiment'].isin(minority_classes)]

majority_downsampled = resample(
    majority_df,
    replace=False,
    n_samples=len(minority_df),
    random_state=42
)

balanced_df = pd.concat([majority_downsampled, minority_df])

In [11]:

class_sizes = balanced_df['sentiment'].value_counts()

dataset_shape = balanced_df.shape

print("Size (number of rows) of each sentiment class:")
print(class_sizes)
print("\nShape (number of rows and columns) of the balanced dataset:")
print(dataset_shape)

Size (number of rows) of each sentiment class:
sentiment
neutral     1967
positive    1363
negative     604
Name: count, dtype: int64

Shape (number of rows and columns) of the balanced dataset:
(3934, 2)


In [12]:

from sklearn.model_selection import train_test_split

X = balanced_df['text']
y = balanced_df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
X_train.shape


(3147,)

In [14]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)

    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

    cleaned_text = ' '.join(tokens)

    return cleaned_text

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]

In [15]:

models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

pipelines = {}
for model_name, model in models.items():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('model', model)
    ])
    pipelines[model_name] = pipeline

In [16]:
param_grids = {
    'Logistic Regression': {
        'model__C': [0.1, 1, 10],
        'model__penalty': ['l1', 'l2']
    },
    'Decision Tree': {
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    }
}

best_models = {}
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [17]:

for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"Model: {model_name}\n{report}\n")
     

Model: Logistic Regression
              precision    recall  f1-score   support

    negative       0.68      0.58      0.63       131
     neutral       0.74      0.80      0.77       395
    positive       0.67      0.65      0.66       261

    accuracy                           0.71       787
   macro avg       0.70      0.68      0.69       787
weighted avg       0.71      0.71      0.71       787


Model: Decision Tree
              precision    recall  f1-score   support

    negative       0.62      0.32      0.42       131
     neutral       0.66      0.84      0.74       395
    positive       0.64      0.53      0.58       261

    accuracy                           0.65       787
   macro avg       0.64      0.57      0.58       787
weighted avg       0.65      0.65      0.64       787


Model: Random Forest
              precision    recall  f1-score   support

    negative       0.64      0.39      0.48       131
     neutral       0.73      0.86      0.79       395
    

In [18]:
X_test

['Stock Exchange Release 10/3/2011 12:00 Sanoma publish Annual Report Financial Statements 2010 Corporate Responsibility Report',
 'Christine Idzelis EQT Partners shed salcomp swedish financial investor EQT Partners sell majority stake mobile phone charger manufacturer Salcomp Oyj nordic investment company Nordstjernan AB',
 'result enable preparation conceptual plant design capital estimate',
 'slide new product release Fiskars',
 'bid offer include 1,000 share value share correspond EUR4 000',
 'phase complete end 2012',
 'follow transaction Tulikivi restructure operation Soapstone Fireplaces Business Natural Stone Product Business Ceramic Products Business unit',
 'finnish electronics manufacturing service EMS company Elcoteq SE post net loss 66.4 mln euro $ 91.2 mln half 2007 compare net profit 7.1 mln euro $ 9.8 mln period 2006',
 'sale return growth April June 2010 CEO Pekka Eloholma say',
 'comment deal Shane Lennon SVP Marketing Product Development GyPSii say',
 'representative

In [19]:
X_test_ = ["how are you"]
predicted_class = model.predict(X_test_)
predicted_class

array(['neutral'], dtype=object)

In [20]:
grid_search.best_estimator_

In [21]:
X_test_ = ["I first saw Jake Gyllenhaal in Jarhead (2005) a little while back and, since then, I've been watching every one of his movies that arrives on my radar screen. Like Clive Owen, he has an intensity (and he even resembles Owen somewhat) that just oozes from the screen. I feel sure that, if he lands some meaty roles, he'll crack an Oscar one day...<br /><br />That's not to denigrate this film at all.<br /><br />It's a fine story, with very believable people (well, it's based upon the author's early shenanigans with rocketry), a great cast Â– Chris Cooper is always good, and Laura Dern is always on my watch list Â– with the appropriate mix of humor, pathos, excitement...and the great sound track with so many rock n roll oldies to get the feet tapping.<br /><br />But, this film had a very special significance for me: in 1957, I was the same age as Homer Hickham; like him, I looked up at the night stars to watch Sputnik as it scudded across the blackness; like Homer also, I experimented with rocketry in my backyard and used even the exact same chemicals for fuel; and like Homer, I also had most of my attempts end in explosive disaster! What fun it was...<br /><br />I didn't achieve his great (metaphorical and physical) heights though. But, that's what you find out when you see this movie.<br /><br />Sure, it's a basic family movie, but that's a dying breed these days, it seems. Take the time to see it, with the kids: you'll all have a lot of good laughs."]
predicted_class = model.predict(X_test_)
predicted_class

array(['neutral'], dtype=object)