In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the movie table and split it by position: every column except the last
# becomes the feature matrix x, the last column becomes the target vector y.
dataset = pd.read_csv('movies.csv')
# x and y are plain NumPy arrays from here on; x mixes strings and numbers,
# so it is an object-dtype array.
x = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Show the feature matrix: every CSV column except the last.
print(x)

[['Avatar: The Way of Water' 7.8 2022 ... 'New Zealand' '$350,000,000 '
  '$2,267,946,983 ']
 ["Guillermo del Toro's Pinocchio" 7.6 2022 ... 'USA' '$35,000,000 '
  '$108,967 ']
 ['Bullet Train' 7.3 2022 ... 'Japan' '$85,900,000 ' '$239,268,602 ']
 ...
 ['Gothika' 5.8 2003 ... 'Canada' '$40,000,000 ' '$141,591,324 ']
 ['Ong-Bak: The Thai Warrior' 7.1 2003 ... 'Thailand' 'Unknown'
  '$20,235,426 ']
 ['Open Water' 5.8 2003 ... 'Bahamas' '$500,000 ' '$54,683,487 ']]


In [4]:
# Show the target vector: the last CSV column.
print(y)

['United States' 'United States, Mexico, France' 'Japan, United States'
 ... 'United States, France, Canada, Spain' 'Thailand, France, Hong Kong'
 'United States']


In [6]:
from sklearn.impute import SimpleImputer

# Fill NaN entries in columns 1 and 2 of x with the mean of each column.
# The imputer is fitted on the whole of x, i.e. before the train/test split
# that happens further down.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [7]:
# Show x again after the NaNs in columns 1-2 were replaced by column means.
print(x)

[['Avatar: The Way of Water' 7.8 2022.0 ... 'New Zealand' '$350,000,000 '
  '$2,267,946,983 ']
 ["Guillermo del Toro's Pinocchio" 7.6 2022.0 ... 'USA' '$35,000,000 '
  '$108,967 ']
 ['Bullet Train' 7.3 2022.0 ... 'Japan' '$85,900,000 ' '$239,268,602 ']
 ...
 ['Gothika' 5.8 2003.0 ... 'Canada' '$40,000,000 ' '$141,591,324 ']
 ['Ong-Bak: The Thai Warrior' 7.1 2003.0 ... 'Thailand' 'Unknown'
  '$20,235,426 ']
 ['Open Water' 5.8 2003.0 ... 'Bahamas' '$500,000 ' '$54,683,487 ']]


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of x and pass every other column through unchanged.
# `sparse_output=False` (scikit-learn >= 1.2) requests a dense array; the older
# `sparse=False` spelling is no longer accepted by current releases.
# NOTE(review): in the printed sample column 0 holds movie titles, so this adds
# about one indicator column per distinct title and shifts the positions of all
# remaining columns - confirm that this is the intended feature.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(sparse_output=False), [0])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))



In [9]:
# Show x after column 0 was one-hot encoded; the indicator columns come first,
# followed by the passthrough columns.
print(x)

[[0.0 0.0 0.0 ... 'New Zealand' '$350,000,000 ' '$2,267,946,983 ']
 [0.0 0.0 0.0 ... 'USA' '$35,000,000 ' '$108,967 ']
 [0.0 0.0 0.0 ... 'Japan' '$85,900,000 ' '$239,268,602 ']
 ...
 [0.0 0.0 0.0 ... 'Canada' '$40,000,000 ' '$141,591,324 ']
 [0.0 0.0 0.0 ... 'Thailand' 'Unknown' '$20,235,426 ']
 [0.0 0.0 0.0 ... 'Bahamas' '$500,000 ' '$54,683,487 ']]


In [10]:
from sklearn.preprocessing import LabelEncoder
# Replace each distinct target string in y with an integer class code.
# The fitted encoder is kept in `le`.
le = LabelEncoder()
y = le.fit_transform(y)

In [11]:
# Show y after label encoding (integer class codes).
print(y)

[245 355 121 ... 293 172 245]


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=1 pins the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [13]:
# Show the training portion of the feature matrix.
print(x_train)

[[0.0 0.0 0.0 ... 'New Zealand' '$350,000,000 ' '$2,267,946,983 ']
 [0.0 0.0 0.0 ... 'Norway' '$15,000,000 ' '$37,394,629 ']
 [0.0 0.0 0.0 ... 'USA' '$8,500,000 ' '$43,709,744 ']
 ...
 [0.0 0.0 0.0 ... 'USA' '$28,000,000 ' '$90,259,536 ']
 [0.0 0.0 0.0 ... 'Bangladesh' '$65,000,000 ' 'Unknown']
 [0.0 0.0 0.0 ... 'USA' 'Unknown' '$23,812,816 ']]


In [14]:
# Show the held-out test portion of the feature matrix.
print(x_test)

[[0.0 0.0 0.0 ... 'USA' '$135,000,000 ' '$245,623,848 ']
 [0.0 0.0 0.0 ... 'Unknown' '$80,000,000 ' '$667,094,506 ']
 [0.0 0.0 0.0 ... 'Ireland' '$19,000,000 ' '$32,686,500 ']
 ...
 [0.0 0.0 0.0 ... 'Mexico' '$170,000,000 ' '$386,600,138 ']
 [0.0 0.0 0.0 ... 'USA' '$2,000,000 ' '$5,557,564 ']
 [0.0 0.0 0.0 ... 'Unknown' '$24,000,000 ' '$237,536,126 ']]


In [15]:
# Show the encoded targets for the training rows.
print(y_train)

[245 176 245 ... 245 245 245]


In [16]:
# Show the encoded targets for the held-out test rows.
print(y_test)

[329 245 336  16 245 245 261 245 159 229 245 245 301 245 245 169 245 275
  54 245 245 245 245 245 245  80 245 261 245 245 245 383  32 229 245 108
 245 245 245 245 170 245 160 334 245 261 245  59 383 245  59 161 261 245
 245 383 229 245 245 245 377 245 229 176 393 245 157 274 237 229 245 245
 245 301 245 313 222 245  83  27 137 184 261 261 348 245 245 245 261 261
 104 245 167 301 393 204 221 251 245 159 245 245 112 245  29 245 245 343
 245 312 346 240 383 194 127 264 383 185 245 245 245 245 287 245 245 404
 245 245  38 261 245 383 245 245 196  23 245 360 245 275 245 363  23  55
 261 307 245 245 245 310 165 261 245 229 245 353 383 332   8 176 334 245
 364 245 229 245 383  23 245  97 130 245 245 334 245 190 392 245 245  57
 245 245 245 245 245 350 200 200  46 351 339  23 383  83 383  97 245 245
 349 211 245 392 155 292 245   3 320 119 245 343 176 337 372 285   6 158
 136 245 245 239 245 383 383 245  93 245 245 256  29 171 200 245 194 245
 129 245 245 245 245 245 227 229 277 245 245 383 24

In [17]:
# One-hot encode the column at position 3 of the already-split feature arrays.
# NOTE(review): x was one-hot encoded on column 0 before the split, so position
# 3 here may no longer be the column found at index 3 of the CSV - confirm the
# index. Also note that the cells after this one print x_train / x_test, not
# the x_train_processed / x_test_processed arrays built here.
categorical_cols = [3]
x_train_categorical = x_train[:, categorical_cols]
x_test_categorical = x_test[:, categorical_cols]

# Build the dummies on train and test stacked together so both parts end up
# with the same indicator columns, then cut the frame apart again by row position.
n_train_rows = len(x_train_categorical)
ohe = pd.get_dummies(pd.concat([pd.DataFrame(x_train_categorical), pd.DataFrame(x_test_categorical)]))
x_train_ohe = ohe.iloc[:n_train_rows]
x_test_ohe = ohe.iloc[n_train_rows:]

# Splice the indicator columns in at the position of the encoded column; the
# slice bounds are derived from categorical_cols so they cannot drift apart.
cat_pos = categorical_cols[0]
x_train_processed = np.hstack((x_train[:, :cat_pos], x_train_ohe.values, x_train[:, cat_pos + 1:]))
x_test_processed = np.hstack((x_test[:, :cat_pos], x_test_ohe.values, x_test[:, cat_pos + 1:]))


  ohe = pd.get_dummies(pd.concat([pd.DataFrame(x_train_categorical), pd.DataFrame(x_test_categorical)]))


In [18]:
# Show the training features.
# NOTE(review): this prints x_train, not the x_train_processed array built in
# the previous cell - confirm which one was meant.
print(x_train)

[[0.0 0.0 0.0 ... 'New Zealand' '$350,000,000 ' '$2,267,946,983 ']
 [0.0 0.0 0.0 ... 'Norway' '$15,000,000 ' '$37,394,629 ']
 [0.0 0.0 0.0 ... 'USA' '$8,500,000 ' '$43,709,744 ']
 ...
 [0.0 0.0 0.0 ... 'USA' '$28,000,000 ' '$90,259,536 ']
 [0.0 0.0 0.0 ... 'Bangladesh' '$65,000,000 ' 'Unknown']
 [0.0 0.0 0.0 ... 'USA' 'Unknown' '$23,812,816 ']]


In [19]:
# Show the test features.
# NOTE(review): this prints x_test, not the x_test_processed array built two
# cells earlier - confirm which one was meant.
print(x_test)

[[0.0 0.0 0.0 ... 'USA' '$135,000,000 ' '$245,623,848 ']
 [0.0 0.0 0.0 ... 'Unknown' '$80,000,000 ' '$667,094,506 ']
 [0.0 0.0 0.0 ... 'Ireland' '$19,000,000 ' '$32,686,500 ']
 ...
 [0.0 0.0 0.0 ... 'Mexico' '$170,000,000 ' '$386,600,138 ']
 [0.0 0.0 0.0 ... 'USA' '$2,000,000 ' '$5,557,564 ']
 [0.0 0.0 0.0 ... 'Unknown' '$24,000,000 ' '$237,536,126 ']]
