In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC

from bs4 import BeautifulSoup
import re

sns.set_style('darkgrid')

In [6]:
df = pd.read_csv('./data/IMDB Dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
df.sample(5)

Unnamed: 0,review,sentiment
14667,I just watched this movie last night. Within 3...,negative
33475,I'll say it again: this movie was totally lame...,negative
41664,"OK, so Soldier isn't deep and meaningful like ...",positive
23098,"Considering it was made on a low budget, THE D...",positive
12598,"Diego Armando Maradona was, and still remains ...",positive


In [8]:
df.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [9]:
df_pos = df[df['sentiment']=='positive'][:5000]
df_neg = df[df['sentiment']=='negative'][:5000]

df_reviews = pd.concat([df_pos, df_neg ])

In [10]:
train,test = train_test_split(df_reviews,test_size =0.33,random_state=42)

In [11]:
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

In [12]:
train_y.value_counts()

sentiment
negative    3378
positive    3322
Name: count, dtype: int64

In [13]:
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
test_x_vector = tfidf.transform(test_x)

In [14]:
train_x.shape

(6700,)

In [15]:
train_x_vector.shape

(6700, 44107)

In [16]:
type(train_x_vector)

scipy.sparse._csr.csr_matrix

In [17]:
pd.DataFrame.sparse.from_spmatrix(train_x_vector,
                                  index=train_x.index,
                                  columns=tfidf.get_feature_names_out())

Unnamed: 0,00,000,007,00am,00s,01,01pm,02,04,05,...,émigré,émigrés,était,étc,être,ísnt,île,önsjön,über,überwoman
6746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
358,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
761,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
train_x.iloc[0]

"I happened to rent this movie with my sister in hopes of watching a great entertaining movie, that was humorous, however my expectations were let down. This movie was beyond disgusting and revolting for a PG-13 movie, this should have been rated R for the many mature references that went on in this movie. I wouldn't recommend allowing a 13 year old teen see this.<br /><br />Even if no one under the age of 17 is watching this movie, beware of a truly stupid movie, there's no humor in the movie, just a bunch of disgusting sexual references including a small touch of pedophilia, something that shouldn't even be joked about. <br /><br />I would like to know what happened to PG-13 movies, that were actually safe for actual a 13 year old? This is beyond a deplorable movie and should be re-rated."

In [19]:
primera_resenia = pd.DataFrame.sparse.from_spmatrix(train_x_vector,
                                  index=train_x.index,
                                  columns=tfidf.get_feature_names_out()).iloc[0]

In [20]:
primera_resenia

00           0
000          0
007          0
00am         0
00s          0
            ..
ísnt         0
île          0
önsjön       0
über         0
überwoman    0
Name: 6746, Length: 44107, dtype: Sparse[float64, 0]

In [21]:
primera_resenia[primera_resenia != 0]

13               0.45849
17               0.12824
actual          0.091601
actually        0.061461
age             0.088765
allowing         0.12824
beware          0.143046
br              0.124945
bunch           0.093128
deplorable      0.168137
disgusting      0.237945
entertaining    0.081088
expectations    0.103149
great           0.048868
happened        0.176853
hopes           0.114028
humor           0.084465
humorous        0.116516
including       0.087603
joked           0.168137
just            0.037675
know            0.053984
let              0.07195
like            0.036454
mature          0.126021
movie           0.276309
movies            0.0522
old             0.123513
pedophilia      0.172712
pg              0.260154
rated           0.206643
recommend       0.076198
references      0.226901
rent            0.093234
revolting       0.151971
safe            0.119732
sexual          0.098677
shouldn         0.108203
sister          0.095008
small           0.078916


In [22]:
train_x.iloc[0]

"I happened to rent this movie with my sister in hopes of watching a great entertaining movie, that was humorous, however my expectations were let down. This movie was beyond disgusting and revolting for a PG-13 movie, this should have been rated R for the many mature references that went on in this movie. I wouldn't recommend allowing a 13 year old teen see this.<br /><br />Even if no one under the age of 17 is watching this movie, beware of a truly stupid movie, there's no humor in the movie, just a bunch of disgusting sexual references including a small touch of pedophilia, something that shouldn't even be joked about. <br /><br />I would like to know what happened to PG-13 movies, that were actually safe for actual a 13 year old? This is beyond a deplorable movie and should be re-rated."

In [23]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'linear'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [24]:
print(svc.predict(tfidf.transform(['A good movie'])))
print(svc.predict(tfidf.transform(['An excellent movie'])))
print(svc.predict(tfidf.transform(['I did not like this movie at all I gave this movie away'])))

['positive']
['positive']
['negative']


In [25]:
print(svc.score(test_x_vector, test_y))

0.8706060606060606


In [26]:
from sklearn.metrics import f1_score

f1_score(test_y,svc.predict(test_x_vector),
          labels = ['positive','negative'],average=None)

array([0.87400413, 0.86701962])

In [27]:
from sklearn.metrics import classification_report

print(classification_report(test_y,
                            svc.predict(test_x_vector),
                            labels = ['positive','negative']))

              precision    recall  f1-score   support

    positive       0.87      0.88      0.87      1678
    negative       0.88      0.86      0.87      1622

    accuracy                           0.87      3300
   macro avg       0.87      0.87      0.87      3300
weighted avg       0.87      0.87      0.87      3300



In [29]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(test_y,
                           svc.predict(test_x_vector),
                           labels = ['positive', 'negative'])
conf_mat

array([[1481,  197],
       [ 230, 1392]])

# Prueba con otros modelos

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [34]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier()
}

In [37]:
def evaluar_modelo(model, name):
    model.fit(train_x_vector, train_y)
    y_pred = model.predict(test_x_vector)

    print(f"\n===== {name} =====")
    print("Accuracy:", accuracy_score(test_y, y_pred))
    print(classification_report(test_y, y_pred))

In [38]:
for name, model in models.items():
    evaluar_modelo(model, name)


===== Logistic Regression =====
Accuracy: 0.8718181818181818
              precision    recall  f1-score   support

    negative       0.88      0.86      0.87      1622
    positive       0.87      0.88      0.87      1678

    accuracy                           0.87      3300
   macro avg       0.87      0.87      0.87      3300
weighted avg       0.87      0.87      0.87      3300


===== Naive Bayes =====
Accuracy: 0.8542424242424242
              precision    recall  f1-score   support

    negative       0.81      0.91      0.86      1622
    positive       0.91      0.80      0.85      1678

    accuracy                           0.85      3300
   macro avg       0.86      0.86      0.85      3300
weighted avg       0.86      0.85      0.85      3300


===== Decision Tree =====
Accuracy: 0.713939393939394
              precision    recall  f1-score   support

    negative       0.70      0.72      0.71      1622
    positive       0.72      0.71      0.72      1678

    accurac

# Conclusión
Máquina de vectores de soporte fue la mejor elección, no obstante LogisticRegression tuvo resultados casi idénticos, el rendimiento de lo modelos Naive Bayes y Decision Tree quedó por debajo.