In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset('imdb')

In [3]:
ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])

In [4]:
ds_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [5]:
ds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [6]:
ds_train.describe()

Unnamed: 0,label
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [7]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(ds_train['text'])
X_test = vectorizer.transform(ds_test['text'])

y_test = ds_test['label']
y_train = ds_train['label']

In [8]:
model = xgb.XGBClassifier()

In [9]:
model.fit(X_train, y_train)

In [10]:
pred = model.predict(X_test)

In [11]:
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, pred)

In [12]:
print('Score:', score)

Score: 0.85704


In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'eta': [0.1, 0.3, 0.7, 1],
    'gamma': [0, 1, 10],
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 10, 15], 
    'learning_rate': [0.01, 0.05, 0.1],  
    'subsample': [0.7, 0.8, 1.0],  
    'colsample_bytree': [0.7, 0.8, 1.0],  
    'reg_alpha': [0, 0.1, 0.5], 
    'reg_lambda': [0.1, 0.5, 1.0]  
}

In [None]:
grid_search = GridSearchCV(
     estimator=model,
     param_grid=param_grid,
     scoring='roc_auc',
     cv=5,   
)

grid_search.fit(X_train, y_train)