In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('IMDB Dataset.csv')

In [5]:
# 50k de datos son demasiados elementos para ser utilizados
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
df_positive = df[df['sentiment']=='positive'][:9000]
df_negative = df[df['sentiment']=='negative'][:1000]
#50k de elementos es sobre entrenar el modelo que puede traer problemas y se desbalancea escogiendo proporicones diferentes.

In [9]:
df_des = pd.concat([df_positive, df_negative])
print(df_des.value_counts('sentiment'))
print(df.value_counts('sentiment'))

sentiment
positive    9000
negative    1000
dtype: int64
sentiment
negative    25000
positive    25000
dtype: int64


In [11]:
df_des

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
2000,Stranded in Space (1972) MST3K version - a ver...,negative
2005,"I happened to catch this supposed ""horror"" fli...",negative
2007,waste of 1h45 this nasty little film is one to...,negative
2010,Warning: This could spoil your movie. Watch it...,negative


# 1. Data set desbalancedo

In [13]:
from imblearn.under_sampling import RandomUnderSampler
#Oversmapling increase number of elements to balance the data


In [14]:
rus = RandomUnderSampler()

df_bal, df_bal['sentiment']  = rus.fit_resample(df_des[['review']], df_des['sentiment'])
df_bal

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,negative
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
10,Phil the Alien is one of those quirky films wh...,negative
11,I saw this movie when I was about 12 when it c...,negative
...,...,...
16852,I do not expect this film to be well understoo...,positive
13458,"As i watched ""Wirey Spindell"" i couldnt but la...",positive
17992,This film is a very descent remake of the famo...,positive
12250,Altman and Scorsese have twisted sex together ...,positive


In [17]:
df_bal.value_counts(df['sentiment'])
# Cantidad de elementos neg and pos igual

sentiment
negative    1000
positive    1000
dtype: int64

# 2. Separado Data para entrenat (Train) y testear (Test)

In [19]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_bal, test_size=0.2, random_state=42)

In [21]:
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

#### Representación de Texto (Bag of Words)

In [None]:
# CountVectorizer frecuencia con la cual un palabra aparece en una oración y su peso positivo o negativo
# Tfidf Relevancia que tiene una palabra en una oracion que no se repite en otras reviews

## 2.1 CountVectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["Amo escribir codigo en Python. Amo el código en Python", "Odio escribir codigo en Java. Odio el código en Java"]

df = pd.DataFrame({'review' : ['review1', 'review2'], 'text':text})
cv=CountVectorizer()
cv_matrix=cv.fit_transform(df['text'])
df_dtm = pd.DataFrame(cv_matrix.toarray(), index=df['review'].values, columns=cv.get_feature_names_out())
df_dtm

Unnamed: 0,amo,codigo,código,el,en,escribir,java,odio,python
review1,2,1,1,1,2,1,0,0,2
review2,0,1,1,1,2,1,2,2,0


## 2.2 Tfidf

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
text = ["Amo escribir codigo en Python. Amo el código en Python",
        "Odio escribir codigo en Java. Odio el código en Java"]

df = pd.DataFrame({'review': ['review1', 'review2'], 'text':text})
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])
df_dtm = pd.DataFrame(tfidf_matrix.toarray(), index=df['review'].values, columns=tfidf.get_feature_names_out())
df_dtm

Unnamed: 0,amo,codigo,código,el,en,escribir,java,odio,python
review1,0.576152,0.204969,0.204969,0.204969,0.409937,0.204969,0.0,0.0,0.576152
review2,0.0,0.204969,0.204969,0.204969,0.409937,0.204969,0.576152,0.576152,0.0


## 2.3 Data transform into numeric vector

In [27]:
tfidf = TfidfVectorizer(stop_words='english')
train_x_vec = tfidf.fit_transform(train_x)
#fit encuentra parametros para la data 
#transform aplica los parametros o valores numericos a la data 
test_x_vec = tfidf.transform(test_x)

In [71]:
train_x_vec

<1600x22704 sparse matrix of type '<class 'numpy.float64'>'
	with 141727 stored elements in Compressed Sparse Row format>

# 3. Support Vector Machine

In [188]:
from sklearn.svm import SVC

svc=SVC()
svc.fit(train_x_vec, train_y)

In [84]:
## Testeo 
print(svc.predict(tfidf.transform(['A good movie'])))
print(svc.predict(tfidf.transform(['An excellent movie'])))
print(svc.predict(tfidf.transform(['I did not like this movie at all, I gave this movie away'])))

['positive']
['positive']
['negative']


# 4. Decision Tree

In [186]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(train_x_vec, train_y)


# 5. Naive Bayes

In [184]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x_vec.toarray(), train_y)

# 6. Logistic Regression

In [182]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_x_vec, train_y)

# 7. Evaluacion de modelos 

## 7.1. Precision del modelo

In [190]:
print(svc.score(test_x_vec, test_y))
print(tree.score(test_x_vec, test_y))
print(gnb.score(test_x_vec.toarray(), test_y))
print(lr.score(test_x_vec, test_y))

#Mayor prediccion de sentimientos correctos.

0.8375
0.7
0.5875
0.83


## 7.2. F1 Score

In [149]:
# Toma en cuenta dos variables importantes
#Recall y precision
from sklearn.metrics import f1_score

# f1_score(test_y, svc.predict(test_x_vec), labels=['positive', 'negative'], average=None)
# f1_score(test_y, tree.predict(test_x_vec), labels = ['positive', 'negative'], average=None)
# f1_score(test_y, gnb.predict(test_x_vec.toarray()), labels=['positive', 'negative'], average=None)
f1_score(test_y, lr.predict(test_x_vec), labels=['positive', 'negative'], average=None)

array([0.83333333, 0.82653061])

## 7.3. Clasification report

In [153]:
from sklearn.metrics import classification_report

print(classification_report(test_y, svc.predict(test_x_vec), labels=['positive', 'negative']))

              precision    recall  f1-score   support

    positive       0.82      0.86      0.84       201
    negative       0.85      0.81      0.83       199

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



## 7.4. Confusion matrix

In [157]:
from sklearn.metrics import confusion_matrix

confusion_matrix(test_y, svc.predict(test_x_vec), labels=['positive', 'negative'])

array([[173,  28],
       [ 37, 162]], dtype=int64)

In [161]:
#        Actual Values
    # Positive(1) Negative(0)   Pre-
#P(1)      TP         FP       dicted
#N(0)      FN         TN       Values

# 8. Model optimization

## 8.1. GridSearchCV

In [192]:
from sklearn.model_selection import GridSearchCV

parameters = {'C':[1,4,8,16,32], 'kernel':['linear', 'rbf']}
svc=SVC()
svc_grid = GridSearchCV(svc, parameters, cv=5)
# cross validation = 5
svc_grid.fit(test_x_vec, test_y)

In [193]:
print(svc_grid.best_estimator_)
print(svc_grid.best_params_)

SVC(C=4, kernel='linear')
{'C': 4, 'kernel': 'linear'}


In [194]:
svc_grid.best_score_

0.8125

In [None]:
# Al final el score disminuyo asi que no tengo claridad de proque empeoro el rendimiento dle modelo jaja