In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re
import time
import datetime
from collections import defaultdict
import ast
from math import sqrt

In [2]:
train_data = pd.read_csv('../data/train.csv')
submission = pd.read_csv('../data/sample_submission.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
#处理original_language
train_data['original_language']=le.fit_transform(train_data['original_language'])
#处理runtime
train_data['runtime']=le.fit_transform(train_data['runtime'])
#处理crew
train_data['crew_size'] = train_data['crew'].apply(lambda y: {} if pd.isnull(y) else sorted(map(lambda x: x['name'], eval(y)))).apply(lambda x: len(x))
#处理cast
cast_gender = train_data['cast'].apply(lambda y: {} if pd.isnull(y) else sorted(map(lambda x: x['gender'], eval(y))))
train_data['log_revenue'] = np.log(train_data['revenue'])

In [4]:
train_data['runtime'].fillna(6000000, inplace=True)

In [5]:
#Extract release_date
train_data[['release_month', 'release_day', 'release_year']] = train_data['release_date'].str.split('/', expand = True).replace(np.nan, -1)
train_data['release_month'] = train_data['release_month'].astype(int)
train_data['release_day'] = train_data['release_day'].astype(int)
train_data['release_year'] = train_data['release_year'].astype(int)
train_data['day_of_Week'] = (pd.to_datetime(train_data["release_date"]).dt.dayofweek)
train_data.loc[(train_data['release_year'] <= 19) & (train_data['release_year'] < 100), "release_year"] += 2000
train_data.loc[(train_data['release_year'] > 19)  & (train_data['release_year'] < 100), "release_year"] += 1900   

In [6]:
data_companies = train_data['production_companies'].apply(lambda y: {} if pd.isnull(y) else sorted(map(lambda x: x['name'], eval(y)))).map(lambda x: ','.join(map(str, x)))
data_companies = data_companies.str.get_dummies(sep=',')
number_companies = data_companies.sum(axis=1)
train_data['number_companies'] = number_companies

In [7]:
train_data.shape

(3000, 30)

In [8]:
feature_names = ['release_year', 'release_day', 'release_month', 'original_language',
                 'budget', 'popularity', 'runtime','number_companies','crew_size']

In [9]:
X=train_data[feature_names]
y=np.array(train_data['log_revenue'])
X.shape

(3000, 9)

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=0)

model=RandomForestRegressor(random_state=1)

In [11]:
model.fit(X_train,y_train)

RandomForestRegressor(random_state=1)

In [12]:
y_pre=model.predict(X_test)

In [13]:
print(sqrt(mean_absolute_error(y_pre,y_test)))

1.19728144561282


In [14]:
score=model.score(X_train,y_train)
score

0.9278841417943807

In [15]:
score=model.score(X_test,y_test)
score

0.4823094544681238

In [16]:
# test sample
test_data = pd.read_csv('../data/test.csv')

In [17]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
#处理original_language
test_data['original_language']=le.fit_transform(test_data['original_language'])
#处理runtime
test_data['runtime']=le.fit_transform(test_data['runtime'])
#处理crew
test_data['crew_size'] = test_data['crew'].apply(lambda y: {} if pd.isnull(y) else sorted(map(lambda x: x['name'], eval(y)))).apply(lambda x: len(x))
#处理cast
cast_gender = train_data['cast'].apply(lambda y: {} if pd.isnull(y) else sorted(map(lambda x: x['gender'], eval(y))))
test_data['cast_female_count']  = cast_gender.apply(lambda x: (pd.Series(x)==1).sum())
test_data['cast_male_count'] = cast_gender.apply(lambda x: (pd.Series(x)==2).sum())

  test_data['cast_female_count']  = cast_gender.apply(lambda x: (pd.Series(x)==1).sum())
  test_data['cast_male_count'] = cast_gender.apply(lambda x: (pd.Series(x)==2).sum())


In [18]:
test_data['runtime'].fillna(6000000, inplace=True)

In [19]:
#Extract release_date
test_data[['release_month', 'release_day', 'release_year']] = test_data['release_date'].str.split('/', expand = True).replace(np.nan, -1)
test_data['release_month'] = test_data['release_month'].astype(int)
test_data['release_day'] = test_data['release_day'].astype(int)
test_data['release_year'] = test_data['release_year'].astype(int)
test_data['day_of_Week'] = (pd.to_datetime(test_data["release_date"]).dt.dayofweek)
test_data.loc[(test_data['release_year'] <= 19) & (test_data['release_year'] < 100), "release_year"] += 2000
test_data.loc[(test_data['release_year'] > 19)  & (test_data['release_year'] < 100), "release_year"] += 1900 
data_companies = test_data['production_companies'].apply(lambda y: {} if pd.isnull(y) else sorted(map(lambda x: x['name'], eval(y)))).map(lambda x: ','.join(map(str, x)))
data_companies = data_companies.str.get_dummies(sep=',')
number_companies = data_companies.sum(axis=1)
test_data['number_companies'] = number_companies

In [20]:
test_data.shape

(4398, 30)

In [25]:
X_test=test_data[feature_names]

In [34]:
y_prediction=model.predict(X_test)
y_prediction=np.expm1(y_prediction)
y_prediction

array([  642368.04076839,   571171.02331857,  8042705.92874621, ...,
       57837122.02900999,  1459578.50278949,   862970.64694829])

In [35]:
pred = pd.DataFrame(y_prediction)
pred

Unnamed: 0,0
0,6.423680e+05
1,5.711710e+05
2,8.042706e+06
3,8.784395e+06
4,3.904903e+05
...,...
4393,1.211966e+08
4394,1.797541e+07
4395,5.783712e+07
4396,1.459579e+06


In [36]:
datasets = pd.concat([submission['id'], pred], axis=1)
datasets.columns =['id', 'revenue']
datasets.to_csv('../data/sample_submission.csv', index=False)