In [1]:


import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24





Pandas   1.2.3
Sklearn  0.24.1


In [2]:
df=pd.read_csv("movies_utf8.csv",encoding="utf8")

In [3]:
df

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,0.0,Fox Searchlight Pictures,UK,Mandie Fletcher,Comedy,4750497.0,Absolutely Fabulous: The Movie,R,2016-07-22,91,5.4,Jennifer Saunders,9161,Jennifer Saunders,2016
6816,0.0,Siempre Viva Productions,USA,Paul Duddridge,Drama,28368.0,Mothers and Daughters,PG-13,2016-05-06,90,4.9,Selma Blair,1959,Paige Cameron,2016
6817,3500000.0,Warner Bros. Animation,USA,Sam Liu,Animation,3775000.0,Batman: The Killing Joke,R,2016-07-25,76,6.5,Kevin Conroy,36333,Brian Azzarello,2016
6818,0.0,Borderline Presents,USA,Nicolas Pesce,Drama,25981.0,The Eyes of My Mother,R,2016-12-02,76,6.2,Kika Magalhães,6947,Nicolas Pesce,2016


In [4]:
X = df.drop(columns=["name", 'released','score']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["score"] #

In [5]:
y = y.round(0) #target gerundet
print(y)

0       8.0
1       8.0
2       7.0
3       8.0
4       7.0
       ... 
6815    5.0
6816    5.0
6817    6.0
6818    6.0
6819    7.0
Name: score, Length: 6820, dtype: float64


In [6]:
df.isnull().sum()

budget      0
company     0
country     0
director    0
genre       0
gross       0
name        0
rating      0
released    0
runtime     0
score       0
star        0
votes       0
writer      0
year        0
dtype: int64

In [7]:
cat_vars  = ["company","country","director","genre","rating","star","writer"]       # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['gross','budget', 'runtime', 'votes', 'year'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['gross', 'budget', 'runtime', 'votes', 'year']

Categorical features:
 ['company', 'country', 'director', 'genre', 'rating', 'star', 'writer']


In [8]:
# YOUR CODE HERE
num_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False))
])

cat_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
  ('ordinal', preprocessing.OrdinalEncoder(categories='auto', handle_unknown='use_encoded_value', unknown_value=99999999)) 
  # ('ordinal', preprocessing.OrdinalEncoder(categories='auto', handle_unknown='ignore'))
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro


In [9]:
num_4_multi_models = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False)),
  ('scaler',  preprocessing.StandardScaler())
])

cat_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
  ('onehot', preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')) 
]) #handle_unknown='use_encoded_value', unknown_value=99999999 --> what is this ?

multi_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_multi_models, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

multi_prepro

In [10]:
num_4_treeModels
num_4_multi_models

In [11]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model #for lasso
from sklearn.neural_network import MLPRegressor

In [13]:


# YOUR CODE HERE
#we dont need scaling/norm for these models
#ordinal encoding (sometimes binary) for categorical 
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":ExtraTreesClassifier(),
  "Random Forest":RandomForestClassifier(),
  "AdaBoost":AdaBoostClassifier(),
  "Skl GBM":GradientBoostingClassifier(),
  "Skl HistGBM":GradientBoostingClassifier(),
  "XGBoost":XGBClassifier()
}

#we need to do scaling/norm for these models
#one hot encoding for categorical
num_classifiers = {
    "Logistic Regression": LogisticRegression()
}


tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
multi_models = {name: pipeline.make_pipeline(multi_prepro, model) for name, model in num_classifiers.items()}


tree_classifiers["Decision Tree"]
multi_models["Logistic Regression"]



In [14]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

# YOUR CODE HERE
#tree model
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    stratify = y,   
    random_state=42,
)

for model_name, model in tree_classifiers.items():

    start_time = time.time()
    model.fit(x_train, y_train) #fit transforms in fit for train data.
    total_time = time.time() - start_time
    
    pred = model.predict(x_val)  #transforms test data here (inside predict)
    
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')





Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,XGBoost,53.152493,31.869779,71.464065
2,Extra Trees,52.272727,28.59758,14.991811
3,Random Forest,52.1261,26.399607,20.110973
4,Skl HistGBM,50.0,33.433462,141.366805
5,Skl GBM,49.266862,32.127871,124.966176
6,Decision Tree,39.73607,27.858286,1.430242
7,AdaBoost,33.064516,21.492603,6.507784


In [15]:
# YOUR CODE HERE
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

# multi_model
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    stratify = y,   
    random_state=42,
)

for model_name, model in multi_models.items():

    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
    
    pred = model.predict(x_val)  
    
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Logistic Regression,53.29912,29.892268,16.938706
