In [0]:
# Data Processing and Cleaning
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

# Sklearn
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from datetime import timedelta, date


In [0]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')


In [0]:
train.belongs_to_collection = train.belongs_to_collection.notna();
test.belongs_to_collection = test.belongs_to_collection.notna();

train.runtime.fillna(0)
median_runtime = train[train['runtime'] > 0]['runtime'].median()
def fill_runtime(df, median_runtime):
    df['runtime_is_median'] = 0
    df.loc[df.runtime == 0, 'runtime_is_median'] = 1
    df.loc[df.runtime == 0, 'runtime'] = median_runtime
    return df
train = fill_runtime(train, median_runtime)
test = fill_runtime(test, median_runtime)


train.budget.fillna(0)
median_budget = train[train['budget'] > 0]['budget'].median()
def fill_budget(df, median_budget):
    df['budget_is_median'] = 0
    df.loc[df.budget == 0, 'budget_is_median'] = 1
    df.loc[df.budget == 0, 'budget'] = median_budget
    return df
train = fill_budget(train, median_budget)
test = fill_budget(test, median_budget)

In [0]:
test.loc[test['release_date'].isnull() == True, 'release_date'] = '10/19/2001'
def add_date_features(df, col, prefix):
    df[col] = pd.to_datetime(df[col])
    future = df[col] > pd.Timestamp(year=2017,month=12,day=31)
    df.loc[future, col] -= timedelta(days=365.25*100)
    
    df[prefix+'_day_of_week'] = df[col].dt.dayofweek
    df[prefix+'_day_of_year'] = df[col].dt.dayofyear
    df[prefix+'_month'] = df[col].dt.month
    df[prefix+'_year'] = df[col].dt.year
    df[prefix+'_day'] = df[col].dt.day
    df[prefix+'_is_year_end'] = df[col].dt.is_year_end
    df[prefix+'_is_year_start'] = df[col].dt.is_year_start
    df[prefix+'_week'] = df[col].dt.week
    df[prefix+'_quarter'] = df[col].dt.quarter    
    
    df.drop(col, axis = 1, inplace = True)

    return df

train = add_date_features(train, 'release_date', 'release')
test = add_date_features(test, 'release_date', 'release')

In [0]:
train_y = train[['id', 'revenue']].set_index('id')

# 'belongs_to_collection', 'original_language', 'production_companies',

train_x = train.set_index('id')
train_x = train[['budget', 'runtime', 'release_day_of_week', 'release_day_of_year', 'release_month', 'release_year', 'release_day', 'release_is_year_end', 'release_is_year_start', 'release_week', 'release_quarter']]
train_x = pd.get_dummies(train_x)
train_x = train_x.fillna(0)

test_x = test.set_index('id')
test_x = test[['budget','runtime', 'release_day_of_week', 'release_day_of_year', 'release_month', 'release_year', 'release_day', 'release_is_year_end', 'release_is_year_start', 'release_week', 'release_quarter']]
test_x = pd.get_dummies(test_x)
test_x = test_x.fillna(0)


train_x, test_x = train_x.align(test_x, join = 'outer', axis =1, fill_value = 0)

In [0]:
train_X = train_x.values
train_Y = train_y.values

X_test = test_x.values


X_train, X_val, y_train, y_val = train_test_split(train_X, train_Y, test_size=0.5, random_state=56)


X_scaler = StandardScaler()
X_train_scaled  = X_scaler.fit_transform(X_train)
X_val_scaled    = X_scaler.transform(X_val)
X_test_scaled   = X_scaler.transform(X_test)
y_scaler = MinMaxScaler((0,1)) 
y_train_scaled  = y_scaler.fit_transform(np.log(y_train)).ravel() 


In [20]:
X_train_scaled

array([[-0.68103027,  0.53650182, -0.2120655 , ..., -0.10383483,
        -0.70107737, -0.5588243 ],
       [-0.62869891, -0.63244933,  0.56378389, ..., -0.10383483,
        -1.79083495, -1.47895214],
       [-0.53634945, -0.72596542,  0.56378389, ..., -0.10383483,
        -0.42863798, -0.5588243 ],
       ...,
       [-0.47478314,  0.95732424,  0.56378389, ..., -0.10383483,
        -1.17784631, -1.47895214],
       [-0.38243368, -0.63244933, -0.98791488, ..., -0.10383483,
        -0.90540692, -0.5588243 ],
       [ 6.91317385,  2.82764608, -2.53961365, ..., -0.10383483,
         1.34221808,  1.28143136]])

In [21]:
# reg = KNeighborsRegressor().fit(X_train_scaled, y_train_scaled)
reg = LinearRegression().fit(X_train_scaled, y_train_scaled)
print(f'Regression Score: {reg.score(X_train_scaled, y_train_scaled)}')
print(f'Regression Coefficient: {reg.coef_[0]}')
print(f'Regression Intercept: {reg.intercept_}')

Regression Score: 0.22329809541418622
Regression Coefficient: 0.06069759248313029
Regression Intercept: 0.7536090472977895


In [23]:
def score_function(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))

def score_function2(y_true, y_pred):
    y_pred = np.where(y_pred>0, y_pred, 0)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

def inverseY(y):
    return np.exp(y_scaler.inverse_transform(np.reshape(y, (-1,1))))

y_train_pred = inverseY(reg.predict(X_train_scaled))
y_val_pred = inverseY(reg.predict(X_val_scaled))
y_test_pred = inverseY(reg.predict(X_test_scaled))
print("RMLS Error on Training Dataset:\t", score_function(y_train , y_train_pred), score_function2(y_train, y_train_pred))
print("RMLS Error on Val Dataset:\t", score_function(y_val , y_val_pred), score_function2(y_val , y_val_pred))
print("RMLS Error on Test Dataset:\t Check by submitting on kaggle")


RMLS Error on Training Dataset:	 2.7132690362662517 2.7132690362662517
RMLS Error on Val Dataset:	 2.7387134018926 2.7387134018926
RMLS Error on Test Dataset:	 Check by submitting on kaggle


In [24]:
df_test = test.assign(revenue=y_test_pred)
df_test_y = df_test[['id', 'revenue']].set_index('id')
df_test_y.to_csv("sample_submission.csv")

pd.read_csv("sample_submission.csv").head(5)

Unnamed: 0,id,revenue
0,3001,5192400.0
1,3002,1727438.0
2,3003,6207549.0
3,3004,5182339.0
4,3005,2050884.0
