# IMDB Movies_Drop Columns

In [159]:
import pandas as pd
import numpy as np

In [160]:
movies = pd.read_csv('IMDb movies.csv',low_memory=False)

In [161]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               85022 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

In [162]:
movies.describe(include='all')

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
count,85855,85855,85855,85855.0,85855.0,85855,85855.0,85791,85022,85768,...,85786,83740,85855.0,85855.0,23710,15326,31016,13305.0,78258.0,74058.0
unique,85855,82094,80852,113.0,22012.0,1257,,4907,4377,34733,...,85729,83611,,,4642,14857,30414,,,
top,tt0052609,Anna,Anna,2017.0,2010.0,Drama,,USA,English,Jesús Franco,...,"Nobuyo Ôyama, Noriko Ohara, Michiko Nomura, Ka...",The story of,,,$ 1000000,$ 1000000,$ 8144,,,
freq,1,10,10,3329.0,113.0,12543,,28511,35939,87,...,9,15,,,758,19,15,,,
mean,,,,,,,100.351418,,,,...,,,5.898656,9493.49,,,,55.896881,46.040826,27.479989
std,,,,,,,22.553848,,,,...,,,1.234987,53574.36,,,,17.784874,178.511411,58.339158
min,,,,,,,41.0,,,,...,,,1.0,99.0,,,,1.0,1.0,1.0
25%,,,,,,,88.0,,,,...,,,5.2,205.0,,,,43.0,4.0,3.0
50%,,,,,,,96.0,,,,...,,,6.1,484.0,,,,57.0,9.0,8.0
75%,,,,,,,108.0,,,,...,,,6.8,1766.5,,,,69.0,27.0,23.0


# Preprocessing

In [163]:
#drop na values
#movies.dropna(subset = ['budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore'], inplace = True)

#drop columns won't be used
movies.drop(columns = ['budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore', 'imdb_title_id', 
                       'title', 'original_title', 'date_published',  'description'], inplace = True)

movies.dropna(inplace = True)

In [164]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66258 entries, 0 to 85851
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  66258 non-null  object 
 1   genre                 66258 non-null  object 
 2   duration              66258 non-null  int64  
 3   country               66258 non-null  object 
 4   language              66258 non-null  object 
 5   director              66258 non-null  object 
 6   writer                66258 non-null  object 
 7   production_company    66258 non-null  object 
 8   actors                66258 non-null  object 
 9   avg_vote              66258 non-null  float64
 10  votes                 66258 non-null  int64  
 11  reviews_from_users    66258 non-null  float64
 12  reviews_from_critics  66258 non-null  float64
dtypes: float64(3), int64(2), object(8)
memory usage: 7.1+ MB


In [165]:
#The publish year of a movie is recorded as 'TV movie 2019'. Replace it with '2019'.
movies['year'] = movies['year'].replace('TV Movie 2019','2019')
movies['year'] = movies['year'].astype(int)

In [166]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66258 entries, 0 to 85851
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  66258 non-null  int64  
 1   genre                 66258 non-null  object 
 2   duration              66258 non-null  int64  
 3   country               66258 non-null  object 
 4   language              66258 non-null  object 
 5   director              66258 non-null  object 
 6   writer                66258 non-null  object 
 7   production_company    66258 non-null  object 
 8   actors                66258 non-null  object 
 9   avg_vote              66258 non-null  float64
 10  votes                 66258 non-null  int64  
 11  reviews_from_users    66258 non-null  float64
 12  reviews_from_critics  66258 non-null  float64
dtypes: float64(3), int64(3), object(7)
memory usage: 7.1+ MB


# Naive Bayes

In [167]:
label = pd.cut(movies['avg_vote'],5,right=True)
movies = movies.drop(columns=['avg_vote'])

In [168]:
#encode
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
movies = movies.apply(le.fit_transform)
label = label.apply(le.fit_transform)

In [176]:
#split training and test sets
from sklearn.model_selection import train_test_split
movies_train, movies_test,label_train,label_test = train_test_split(movies, label,test_size=0.2, random_state=29)
print(movies_train.shape)
print(movies_test.shape)
print(label_train.shape)
print(label_test.shape)

(53006, 12)
(13252, 12)
(53006,)
(13252,)


In [177]:
#CategoricalNB
from sklearn.naive_bayes import CategoricalNB
CNB = CategoricalNB()
CNB.fit(movies_train,label_train)
print('Accuracy score: %f'%CNB.score(movies_test,label_test))

Accuracy score: 0.620057


In [178]:
#BernoulliNB
from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()
BNB.fit(movies_train,label_train)

#evaluate the result
print('Accuracy score: %f'%BNB.score(movies_test,label_test))

Accuracy score: 0.452007


In [179]:
#GaussianNB
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()
GNB.fit(movies_train,label_train)

#evaluate the ressult
print('Accuracy score: %f'%GNB.score(movies_test,label_test))

Accuracy score: 0.393676
