# ------------------------------------Imports---------------------------------------

In [6]:
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# NLTK resources used by this notebook:
#   - stopwords: stop-word list used during token clean-up
#   - wordnet:   dictionary backing WordNetLemmatizer
#   - punkt / punkt_tab: sentence/word tokenizer models required by word_tokenize
#     (recent NLTK releases look for 'punkt_tab', older ones for 'punkt').
# Without the tokenizer data, word_tokenize raises LookupError on a fresh machine.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haykel.bargougui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haykel.bargougui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# ---------------------------------data_preprocessing--------------------------------

In [7]:
# Load the two source spreadsheets. The paths are relative, so both files must be
# present in the notebook's working directory. Each frame is later read through its
# 'Paragraph' column.
dataset1 = pd.read_excel("Copy-of-Data_Verif-sentiment_analysis_data.xlsx")
dataset2 = pd.read_excel("verifNeutral.xlsx")

In [8]:
# (rows, columns) of each raw dataset, one per line.
print(dataset1.shape, dataset2.shape, sep="\n")

(105182, 7)
(19130, 4)


In [9]:
# dtype of the text column in each raw dataset, one per line.
print(dataset1['Paragraph'].dtype, dataset2['Paragraph'].dtype, sep="\n")

object
object


In [10]:
# Keep only the leading rows of each dataset.
# NOTE(review): the row counts are hard-coded; their rationale is not shown here.
df1 = dataset1.iloc[:15016]
df2 = dataset2.iloc[:4567]

In [11]:
# Stack the two subsets into a single frame.
concatenated = pd.concat([df1, df2])

# Shuffle all rows with a fixed seed, then renumber the index from 0.
X = concatenated.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the shuffled frame.
X

Unnamed: 0,Paragraph,Company,Sentiment,Sentiment_Score,Unnamed: 4,Unnamed: 5,Topic,DetectedCompany
0,St. Clair County and the State of Alabama are ...,Eissmann Automotive Group,positive,0.951278,,,SOCIAL,
1,Anti Corruption Principle 10 Businesses should...,Global,neutral,,,,,
2,There are still hundreds of employees working ...,Twitter,neutral,,,,,
3,Moomoo Financial Singapore Pte. Ltd moomoo SG...,Moomoo Financial Singapore Pte. Ltd,positive,0.968842,,,GOVERNANCE,
4,"If they cant see the ROI in their investment ,...",,positive,0.931044,,,SOCIAL,
...,...,...,...,...,...,...,...,...
19578,This is just one more step in getting there an...,METRO,positive,0.908480,,,SOCIAL,
19579,His expertise at advancing strategic initiativ...,,positive,0.774756,,,SOCIAL,
19580,"To that end , my Administration will advance e...",,positive,0.877340,,,GOVERNANCE,
19581,There comes a time in a companys evolution tha...,Green Thumb Industries,negative,0.107648,,,GOVERNANCE,


In [12]:
# Column dtypes and non-null counts of the combined frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Paragraph        19582 non-null  object 
 1   Company          12718 non-null  object 
 2   Sentiment        19583 non-null  object 
 3   Sentiment_Score  15016 non-null  float64
 4   Unnamed: 4       0 non-null      float64
 5   Unnamed: 5       0 non-null      float64
 6   Topic            15016 non-null  object 
 7   DetectedCompany  900 non-null    object 
dtypes: float64(3), object(5)
memory usage: 1.2+ MB


In [13]:
# Summary statistics from describe(), transposed so each described column becomes a row.
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sentiment_Score,15016.0,0.636513,0.385054,0.015197,0.124842,0.867851,0.933071,0.983873
Unnamed: 4,0.0,,,,,,,
Unnamed: 5,0.0,,,,,,,


In [14]:
# Force the text column to str. Missing paragraphs are replaced by an empty string
# first: astype(str) alone would turn NaN into the literal text "nan", which would
# then be tokenized and fed to the models as a real word.
X['Paragraph'] = X['Paragraph'].fillna('').astype(str)

In [15]:
# Tokenize every paragraph with NLTK's word_tokenize and store the result per row.
X['Tokens'] = X['Paragraph'].apply(word_tokenize)

In [16]:
# English stop words, held in a set; read by preprocess_text for membership tests.
stop_words = set(stopwords.words('english'))

# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(tokens):
    """Clean a sequence of tokens for vectorization.

    Keeps only purely alphabetic tokens, lower-cases them, drops those found in
    the module-level ``stop_words`` set and lemmatizes the rest with the
    module-level ``lemmatizer``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list of cleaned tokens, in their original order.
    """
    cleaned = []
    for token in tokens:
        if not token.isalpha():
            continue  # punctuation, digits and mixed tokens are discarded
        lowered = token.lower()
        if lowered in stop_words:
            continue  # stop word
        cleaned.append(lemmatizer.lemmatize(lowered))
    return cleaned


# Overwrite the raw token lists with their cleaned version.
# Note: this mutates X['Tokens'] in place, so re-running the cell re-processes cleaned tokens.
X['Tokens'] = X['Tokens'].apply(preprocess_text)

In [17]:
# Features are the cleaned token lists; the target is the sentiment label column.
x = X['Tokens']
y = X['Sentiment']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Machine Learning 

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.model_selection import cross_val_score

def rmse_cv(model, features=None, target=None, cv=5):
    """Mean cross-validated RMSE of ``model``.

    Args:
        model: estimator accepted by ``cross_val_score``.
        features: training inputs. Defaults to the notebook-level ``X``.
        target: training targets. Defaults to the notebook-level ``y``.
        cv: cross-validation splitting strategy passed to ``cross_val_score``
            (default 5, as before).

    Returns:
        The mean over folds of sqrt(-neg_mean_squared_error).

    NOTE(review): the default ``X`` is the whole DataFrame and ``y`` holds the
    sentiment labels, so the defaults presumably do not work with this
    regression scorer; pass numeric ``features``/``target`` explicitly.
    """
    # Resolve the defaults at call time so the current notebook globals are used.
    features = X if features is None else features
    target = y if target is None else target
    neg_mse_per_fold = cross_val_score(
        model, features, target, scoring="neg_mean_squared_error", cv=cv
    )
    return np.sqrt(-neg_mse_per_fold).mean()
    

def evaluation(y, predictions):
    """Compute regression error metrics for a set of predictions.

    Args:
        y: ground-truth values.
        predictions: predicted values, aligned with ``y``.

    Returns:
        Tuple ``(mae, mse, rmse, r_squared)``.
    """
    mse = mean_squared_error(y, predictions)
    return (
        mean_absolute_error(y, predictions),
        mse,
        np.sqrt(mse),  # RMSE is derived from the MSE computed above
        r2_score(y, predictions),
    )

# MultinomialNB : 

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

In [20]:
# X_train holds token lists; join each back into one space-separated string before fitting.
pipeline.fit(X_train.apply(' '.join), y_train)

In [21]:
# Predict labels for the held-out rows, using the same token re-joining as for training.
y_pred = pipeline.predict(X_test.apply(' '.join))

In [22]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# NOTE(review): evaluation(y_test, y_pred) is left disabled; it computes regression
# metrics, which presumably do not apply to these class labels.

Accuracy: 0.7015573142711259


In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Raw term counts feeding a multinomial Naive Bayes classifier.
# The vectorizer step is named after what it actually contains (it was labelled
# 'tfidf' although it holds a CountVectorizer), matching the other
# CountVectorizer pipelines in this notebook.
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

In [24]:
# Fit on the training token lists re-joined into space-separated strings.
pipeline.fit(X_train.apply(' '.join), y_train)

In [25]:
# Predict labels for the held-out rows (tokens re-joined the same way as for training).
y_pred = pipeline.predict(X_test.apply(' '.join))

In [26]:
from sklearn.metrics import accuracy_score

# Accuracy of the pipeline fitted most recently, measured on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7988256318611182


# RandomForestClassifier :

### TfidfVectorizer :

In [27]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees; the seed is fixed for reproducibility.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# TF-IDF features feeding the random forest.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', rf_classifier),
])

In [28]:
# Fit on the training token lists re-joined into space-separated strings.
pipeline.fit(X_train.apply(' '.join), y_train)

In [29]:
# Predict labels for the held-out rows (tokens re-joined the same way as for training).
y_pred = pipeline.predict(X_test.apply(' '.join))

In [30]:
from sklearn.metrics import accuracy_score

# Compute the accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (TF-IDF):", accuracy)

# R2 score (regression only), left disabled:
# r2 = r2_score(y_test, y_pred)  # use this for a regression model
# print("R2 Score:", r2)

Accuracy (TF-IDF): 0.7829971917283636


### CountVectorizer :

In [31]:
from sklearn.ensemble import RandomForestClassifier
# Fresh random forest (100 trees, fixed seed) for the bag-of-words variant.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer


# Raw term counts feeding the random forest.
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('classifier', rf_classifier)
])

In [32]:
# Fit on the training token lists re-joined into space-separated strings.
pipeline.fit(X_train.apply(' '.join), y_train)

In [33]:
# Predict labels for the held-out rows (tokens re-joined the same way as for training).
y_pred = pipeline.predict(X_test.apply(' '.join))

In [34]:
from sklearn.metrics import accuracy_score

# Accuracy of the CountVectorizer + random forest pipeline on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (CountVectorizer):", accuracy)

Accuracy (CountVectorizer): 0.7911667092162369


# XGBoost

In [35]:
!pip install xgboost



  return process_handler(cmd, _system_body)
  return process_handler(cmd, _system_body)
  return process_handler(cmd, _system_body)


In [36]:
 # Normalise the misspelled "postive" label in BOTH splits. Fixing only y_test leaves
 # the typo in y_train, so the models trained below (and the LabelEncoder fitted on
 # y_train) would still treat it as a separate class.
 y_train = y_train.replace("postive", "positive")
 y_test = y_test.replace("postive", "positive")

In [37]:
from sklearn.preprocessing import LabelEncoder

# Encoder mapping the string sentiment labels to integer codes.
label_encoder = LabelEncoder()

# Learn the mapping from the training labels only, then apply it to both splits.
# transform() raises if y_test contains a label that was not seen in y_train.
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [38]:
# Create the XGBoost classifier (100 boosting rounds, fixed seed).
from xgboost import XGBClassifier


xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer


# Define the pipeline: TF-IDF features feeding the XGBoost classifier.
pipeline = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('classifier', xgb_classifier)
])


In [39]:
# Fit the pipeline on the re-joined training tokens, using the integer-encoded labels.
pipeline.fit(X_train.apply(' '.join), y_train_encoded)

In [40]:
# Predict integer class codes for the held-out rows.
y_pred_encoded = pipeline.predict(X_test.apply(' '.join))

# Map the predicted codes back to the original string labels.
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Accuracy is computed on the string labels, not on the encoded ones.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (TF-IDF + XGBoost):", accuracy)

Accuracy (TF-IDF + XGBoost): 0.7819760020423794


In [41]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Integer-encode the string labels; the mapping is learned from the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Raw term counts feeding an XGBoost classifier (100 boosting rounds, fixed seed).
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer()),
    ('classifier', xgb_classifier),
])

# Train on the re-joined token strings with the encoded labels.
pipeline.fit(X_train.apply(' '.join), y_train_encoded)

# Predict integer codes, then translate them back to the string labels.
y_pred_encoded = pipeline.predict(X_test.apply(' '.join))
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Accuracy on the held-out rows, compared on string labels.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (CountVectorizer + XGBoost):", accuracy)


Accuracy (CountVectorizer + XGBoost): 0.7812101097778913


In [42]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier.
svm_classifier = SVC(kernel='linear', random_state=42)

# TF-IDF features feeding the SVM.
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', svm_classifier)
])

# Train and predict on the token lists re-joined into space-separated strings.
pipeline_svm.fit(X_train.apply(' '.join), y_train)
y_pred_svm = pipeline_svm.predict(X_test.apply(' '.join))

# Accuracy on the held-out rows.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy (TF-IDF + SVM):", accuracy_svm)


Accuracy (TF-IDF + SVM): 0.8320142966556038


In [43]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Create SVM classifier
# Linear-kernel SVM on raw term counts.
svm_classifier = SVC(kernel='linear', random_state=42)
pipeline_svm_count = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer()),
    ('classifier', svm_classifier),
])

# Train on the re-joined training tokens, then label the held-out rows.
pipeline_svm_count.fit(X_train.apply(' '.join), y_train)
y_pred_svm_count = pipeline_svm_count.predict(X_test.apply(' '.join))

# Accuracy on the held-out rows.
accuracy_svm_count = accuracy_score(y_test, y_pred_svm_count)
print("Accuracy (CountVectorizer + SVM):", accuracy_svm_count)

Accuracy (CountVectorizer + SVM): 0.809548123563952


In [44]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


  return process_handler(cmd, _system_body)
  return process_handler(cmd, _system_body)
  return process_handler(cmd, _system_body)


# Deep learning: