<a href="https://colab.research.google.com/github/Meshal6299/multilingual-movie-reviews-NLP/blob/main/notebooks/03_sentiment_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pickle
import spacy

In [2]:
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [3]:
# Load spaCy models
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

In [4]:
# Load the cleaned datasets
eng = pd.read_csv("../data/processed/01_cleaned_imdb_en.csv")
spa = pd.read_csv("../data/processed/01_cleaned_imdb_es.csv")

print("Datasets loaded successfully!")
print(eng.head(), "\n")
print(spa.head())

Datasets loaded successfully!
                                          clean_text sentiment
0  starts really well nice intro and build up for...  negative
1  terrific movie if you did not watch yet you mu...  positive
2  i have seen hundreds of silent movies some wil...  positive
3  i had been looking for this film for so long b...  positive
4  good engaging cinematic firefights great prese...  positive 

                                          clean_text sentiment
0  comienza muy bien bonita intro y acumule para ...  negativo
1  película excelente si aún no lo observaste deb...  positivo
2  he visto cientos de películas silenciosas algu...  positivo
3  había estado buscando esta película durante ta...  positivo
4  bueno atractiva firefights cinematicales una g...  positivo


In [5]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit English vectorizer on English only
X_en = vectorizer.fit_transform(eng["clean_text"])
y_en = eng["sentiment"]

# Fit new Spanish vectorizer on Spanish only
vectorizer_es = TfidfVectorizer(max_features=5000)
X_es = vectorizer_es.fit_transform(spa["clean_text"])
y_es = spa["sentiment"]

print("TF-IDF vectors created!")

TF-IDF vectors created!


In [6]:
X_train_en, X_test_en, y_train_en, y_test_en = train_test_split(
    X_en, y_en, test_size=0.2, random_state=42
)

X_train_es, X_test_es, y_train_es, y_test_es = train_test_split(
    X_es, y_es, test_size=0.2, random_state=42
)

model_en = LogisticRegression(max_iter=2000)
model_es = LogisticRegression(max_iter=2000)

model_en.fit(X_train_en, y_train_en)
model_es.fit(X_train_es, y_train_es)

print("Sentiment models trained!")

Sentiment models trained!


In [7]:
pred_en = model_en.predict(X_test_en)
pred_es = model_es.predict(X_test_es)

print("\n=== ENGLISH SENTIMENT CLASSIFICATION ===\n")
print(classification_report(y_test_en, pred_en))

print("\n=== SPANISH SENTIMENT CLASSIFICATION ===\n")
print(classification_report(y_test_es, pred_es))

# Store summarized numbers
results = pd.DataFrame({
    "Language": ["English", "Spanish"],
    "Accuracy": [
        accuracy_score(y_test_en, pred_en),
        accuracy_score(y_test_es, pred_es)
    ],
    "F1-score": [
        f1_score(y_test_en, pred_en, average="weighted"),
        f1_score(y_test_es, pred_es, average="weighted")
    ]
})

results


=== ENGLISH SENTIMENT CLASSIFICATION ===

              precision    recall  f1-score   support

    negative       0.87      0.86      0.86       987
    positive       0.86      0.87      0.87      1013

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000


=== SPANISH SENTIMENT CLASSIFICATION ===

              precision    recall  f1-score   support

    negativo       0.85      0.85      0.85       987
    positivo       0.85      0.86      0.85      1013

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



Unnamed: 0,Language,Accuracy,F1-score
0,English,0.8645,0.864483
1,Spanish,0.852,0.851993


In [8]:
pickle.dump(model_en, open("../src/sentiment_en.pkl", "wb"))
pickle.dump(model_es, open("../src/sentiment_es.pkl", "wb"))

pickle.dump(vectorizer, open("../src/vectorizer_en.pkl", "wb"))
pickle.dump(vectorizer_es, open("../src/vectorizer_es.pkl", "wb"))

print("Models saved!")

Models saved!


In [9]:
def extract_ner(doc):
    return [(ent.text, ent.label_) for ent in doc.ents]

eng["entities"] = eng["clean_text"].apply(lambda x: extract_ner(nlp_en(x)))
spa["entities"] = spa["clean_text"].apply(lambda x: extract_ner(nlp_es(x)))

print("NER extraction complete!")

NER extraction complete!


In [10]:
print(eng[["clean_text", "entities"]].head())
print(spa[["clean_text", "entities"]].head())

                                          clean_text  \
0  starts really well nice intro and build up for...   
1  terrific movie if you did not watch yet you mu...   
2  i have seen hundreds of silent movies some wil...   
3  i had been looking for this film for so long b...   
4  good engaging cinematic firefights great prese...   

                                            entities  
0  [(about minutes, TIME), (american, NORP), (kea...  
1  [(geena davis, PERSON), (samuel l jackson, PER...  
2  [(hundreds, CARDINAL), (william randolph, PERS...  
3                                [(second, ORDINAL)]  
4                                 [(third, ORDINAL)]  
                                          clean_text  \
0  comienza muy bien bonita intro y acumule para ...   
1  película excelente si aún no lo observaste deb...   
2  he visto cientos de películas silenciosas algu...   
3  había estado buscando esta película durante ta...   
4  bueno atractiva firefights cinematicales una g... 

In [12]:
eng.to_csv("../data/processed/03_ner_english.csv", index=False)
spa.to_csv("../data/processed/03_ner_spanish.csv", index=False)

print("NER datasets saved!")

NER datasets saved!
