In [2]:
import subprocess
import sys

# List of required packages
required_packages = [
    "pickle", "pandas", "sklearn", "numpy"
]

def install_package(package):
    """Installs a package using pip if it's not already installed."""
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Check and install missing packages
for package in required_packages:
    install_package(package)

pickle is already installed.
pandas is already installed.
sklearn is already installed.
numpy is already installed.


In [3]:
import pickle
import pandas as pd
import numpy as np

In [4]:
file_path = "../Dataset/others_reviews.pkl"

with open(file_path, "rb") as f:
    others_reviews_df = pickle.load(f)

# Visualizza le prime righe
display(others_reviews_df.head())


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes
0,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0
1,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0
2,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0
3,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0
4,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0


In [5]:
file_path = "../Dataset/sw_reviews.pkl"

with open(file_path, "rb") as f:
    sw_reviews_df = pickle.load(f)

# Visualizza le prime righe
display(sw_reviews_df.head())


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes
0,2221293,tt0076759,Star Wars: Episode IV - A New Hope,,15 March 2010,Impossible to watch with fresh eyes,It was a long time ago when I first saw Star W...,0.0,0.0
1,4756672,tt0076759,Star Wars: Episode IV - A New Hope,10.0,1 April 2019,It's Still Just Star Wars to Me,While I will acknowledge its faults this is st...,0.0,0.0
2,156096,tt0076759,Star Wars: Episode IV - A New Hope,10.0,19 January 1999,A modern myth that can't be beat,Star Wars is a modern myth that has a story li...,0.0,0.0
3,155657,tt0076759,Star Wars: Episode IV - A New Hope,,28 August 1999,There is a God and his name is George Lucas,I saw for the first time when I was six years ...,0.0,0.0
4,155649,tt0076759,Star Wars: Episode IV - A New Hope,1.0,31 August 1999,Good but over-rated.,"Frankly, I think ""Star wars"" is a great movie....",7.0,53.0


In [6]:
complete_df = pd.concat([sw_reviews_df, others_reviews_df], ignore_index=True)
complete_df.head()


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes
0,2221293,tt0076759,Star Wars: Episode IV - A New Hope,,15 March 2010,Impossible to watch with fresh eyes,It was a long time ago when I first saw Star W...,0.0,0.0
1,4756672,tt0076759,Star Wars: Episode IV - A New Hope,10.0,1 April 2019,It's Still Just Star Wars to Me,While I will acknowledge its faults this is st...,0.0,0.0
2,156096,tt0076759,Star Wars: Episode IV - A New Hope,10.0,19 January 1999,A modern myth that can't be beat,Star Wars is a modern myth that has a story li...,0.0,0.0
3,155657,tt0076759,Star Wars: Episode IV - A New Hope,,28 August 1999,There is a God and his name is George Lucas,I saw for the first time when I was six years ...,0.0,0.0
4,155649,tt0076759,Star Wars: Episode IV - A New Hope,1.0,31 August 1999,Good but over-rated.,"Frankly, I think ""Star wars"" is a great movie....",7.0,53.0


Elimino le 20 parole più frequenti rimuovendo le stopwords. CONTRO: ci sono parole come good, great, best che possono essere rilevanti e non dovrebbero essere eliminate.

In [7]:
from collections import Counter
import re
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Lista di stopwords ufficiali di Scikit-learn
stopwords = ENGLISH_STOP_WORDS

# Unione recensioni
texts = pd.concat([
    others_reviews_df['Review_Text'],
    sw_reviews_df['Review_Text']
]).dropna().astype(str).tolist()

# Tokenizzazione + rimozione stopword (locale, senza toccare i dati originali)
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text) # Rimuove tutto ciò che non è una lettera minuscola o uno spazio (quindi numeri, punteggiatura)
    return [w for w in text.split() if w not in stopwords and len(w) > 2]

# Tokenize & conta
all_tokens = []
for text in texts:
    all_tokens.extend(tokenize(text))

freq_dist = Counter(all_tokens)
common_words = freq_dist.most_common(100)

# Mostra risultati
common_df = pd.DataFrame(common_words, columns=["word", "word_frequency"])
display(common_df.head(20))


Unnamed: 0,word,word_frequency
0,movie,103294
1,film,69598
2,star,62314
3,wars,56288
4,like,45447
5,just,43348
6,good,34327
7,story,31479
8,time,26044
9,great,25726


Uso spaCy e la similarità semantica tra vettori per capire se una parola è concettualmente vicina al dominio "cinema": 1. Carica il modello linguistico di spaCy (en_core_web_md) che ha embedding vettoriali pre-addestrati per ogni parola.
2. Questo modello assegna a ogni parola un vettore numerico che rappresenta il suo significato semantico.
3. Estrae il vettore semantico di riferimento per la parola "movie".
4. Calcola la cosine similarity tra il vettore di movie e il vettore di un'altra parola del testo.
  Se la similarità è maggiore di threshold, la parola è considerata semanticamente vicina a "movie" → quindi potenzialmente legata al cinema e da escludere. 


  CONTRO: compaiono anche parole come "good" perchè i modelli di spaCy si basano su co-occorrenze nei testi. E parole come good, great sono molto usate nei contesti di recensioni di film → spaCy le associa semanticamente a "movie", anche se non sono parole "di dominio cinema", ma di sentiment.


In [9]:
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nlp = spacy.load("en_core_web_md")

cinema_ref = nlp("movie")
common_words = common_df["word"].tolist()

related_words = [
    word for word in common_words
    if word not in ENGLISH_STOP_WORDS and len(word) > 2
    and nlp(word).has_vector
    and cinema_ref.similarity(nlp(word)) > 0.3
]

display(related_words)


['movie',
 'film',
 'wars',
 'just',
 'good',
 'story',
 'time',
 'really',
 'characters',
 'jedi',
 'movies',
 'character',
 'did',
 'episode',
 'films',
 'way',
 'trilogy',
 'bad',
 'plot',
 'better',
 'scenes',
 'know',
 'end',
 'watch',
 'does',
 'seen',
 'little',
 'old',
 'going',
 'lot',
 'things',
 'darth',
 'vader',
 'empire',
 'thing',
 'far',
 'watching',
 'thought',
 'series',
 'actually',
 'come',
 'look',
 'real',
 'bit',
 'actors',
 'saga',
 'times',
 'battle',
 'point',
 'right']

Usa spacy e in più filtro per escludere dall'analisi le parole che identificano il sentiment (presenti nel dizionario nltk). Usa VADER (Valence Aware Dictionary and sEntiment Reasoner), un sentiment analyzer fornito da NLTK, per determinare se una singola parola ha un contenuto emotivo rilevante. Se il valore assoluto del punteggio è maggiore di 0.3 → la parola è considerata emotivamente connotata.

In [8]:
import pandas as pd
import re
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Setup
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_md")

# Stopwords ufficiali da sklearn
stopwords = ENGLISH_STOP_WORDS

# Combina le review
texts = pd.concat([
    others_reviews_df['Review_Text'],
    sw_reviews_df['Review_Text']
]).dropna().astype(str).tolist()

# Tokenizzazione con rimozione stopwords
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return [w for w in text.split() if w not in stopwords and len(w) > 2]

# Frequenze
all_tokens = []
for text in texts:
    all_tokens.extend(tokenize(text))

freq_dist = Counter(all_tokens)
common_words = freq_dist.most_common(100)
common_df = pd.DataFrame(common_words, columns=["word", "count"])

# Calcola parole cinema-related ma NON sentiment
cinema_ref = nlp("movie")
similarity_threshold = 0.3

def is_sentiment_word(word, threshold=0.3):  #con una soglia maggiore ottengo poche parole
    score = sid.polarity_scores(word)['compound']
    return abs(score) > threshold

# Applichiamo filtro combinato
filtered_words = []
for word in common_df["word"]:
    token = nlp(word)
    if token.has_vector and token.similarity(cinema_ref) > similarity_threshold:
        if not is_sentiment_word(word):
            filtered_words.append(word)

# Creiamo DataFrame finale con parole cinema-related
cinema_related_df = common_df[common_df["word"].isin(filtered_words)].reset_index(drop=True)
cinema_related_df.head(20)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\geusafrancesca\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,word,count
0,movie,103294
1,film,69598
2,just,43348
3,story,31479
4,time,26044
5,really,25617
6,characters,25056
7,jedi,21914
8,movies,21425
9,character,19002
