In [1]:
import pandas as pd
import numpy as np

In [2]:
training_data = pd.read_csv("input_data_train.csv")
pred_data = pd.read_csv("input_data_pred.csv")

In [3]:
training_data.describe()

Unnamed: 0,location,product,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,price
count,10110600.0,10110600.0,10110600.0,9706176.0,9706176.0,9706176.0,9706176.0,8827168.0
mean,2419.8,185.0,8.604586,8.665875,11.73502,5.596728,362.9658,2.875698
std,2155.645,106.5207,13.03595,5.113955,5.717526,4.884786,181.2255,1.911442
min,55.0,1.0,0.0,-6.545,-2.93,-10.91,0.0,0.0
25%,437.0,93.0,2.0,4.96,7.82,1.97,225.0,1.6
50%,2166.5,185.0,5.0,8.51,11.42,5.56,338.0,2.483333
75%,3353.0,277.0,10.0,12.47,15.98,9.23,488.0,3.3
max,9137.0,369.0,819.0,25.63,30.28,20.98,1022.0,56.1


In [4]:
training_data.head()

Unnamed: 0,location,product,date,sa_quantity,temp_mean,temp_max,temp_min,sunshine_quant,event,price
0,1193,1,2011-12-31,5,10.635,11.19,10.08,56.0,New Year's Eve,1.48
1,1193,1,2012-01-01,0,8.845,10.76,6.93,213.0,New Year's Day,
2,1193,1,2012-01-02,2,5.51,7.24,3.78,285.0,New Year's Day observed,1.5
3,1193,1,2012-01-03,2,8.915,11.61,6.22,205.0,2nd January (substitute day),1.5
4,1193,1,2012-01-04,0,7.38,8.87,5.89,224.0,,


In [5]:
from sklearn import preprocessing

# Given a data frame and the columns containing numerical continuous features,
# output the data frame with normalized values per column.
def normalize_features(data_df, columns):
    for feature in columns:
        continuous_data = data_df[feature]
        continuous_data_values = continuous_data.values.reshape(-1, 1)

        min_max_scaler = preprocessing.MinMaxScaler()
        continuous_data_values_scaled = min_max_scaler.fit_transform(continuous_data_values)

        data_df[feature] = continuous_data_values_scaled

def encode_categorical_features(data_df, columns):
    encoders = {}
    for feature in columns:
        le = preprocessing.LabelEncoder()
        le.fit(data_df[feature].values)
        data_df[feature] = le.transform(data_df[feature].values)
        encoders[feature] = le
    return encoders
        
# Performs preprocessing on the data for the training set.
def preprocess(df, continuous_feature_columns=[], categorical_feature_columns=[], 
               label_column="sa_quantity", date_column="date"):
    # Fill in empty values with placeholders.
    df = df.fillna({"price": 0, "event": "N/A", "location": 0, "date": "N/A", "sa_quantity": 0,
                    "temp_mean":  0, "temp_max": 0, "temp_min": 0, "sunshine_quant": 0})

    # Normalize numerical features
    df_Y = df[label_column]
    normalize_features(df, continuous_feature_columns)
    
    # Convert date column to date attributes
    df["day"] = pd.to_datetime(df[date_column]).dt.day
    df["month"] = pd.to_datetime(df[date_column]).dt.month
    df["year"] = pd.to_datetime(df[date_column]).dt.year
    
    # Remove date_column from training set.
    df_columns = list(df.columns.values)
    df_columns.remove(date_column)
    df = df[df_columns]
    
    categorical_feature_columns.append("day")
    categorical_feature_columns.append("month")
    categorical_feature_columns.append("year")
    
    # Convert event to categorical value.
    encoders = encode_categorical_features(df, categorical_feature_columns)
    
    # Remove label_column from training set.
    df_columns = list(df.columns.values)
    df_columns.remove(label_column)
    df_X = df[df_columns]
    
    return df_X, df_Y, encoders

continuous_feature_columns = ["temp_mean", "temp_max", "temp_min", "sunshine_quant", "price"]
categorical_feature_columns = ["event", "location", "product"]

X, y, encoders = preprocess(training_data, continuous_feature_columns, categorical_feature_columns)

In [6]:
X.head()

Unnamed: 0,location,product,temp_mean,temp_max,temp_min,sunshine_quant,event,price,day,month,year
0,18,0,0.533955,0.425173,0.6582,0.054795,58,0.026381,30,11,0
1,18,0,0.478322,0.412225,0.559423,0.208415,56,0.0,0,0,1
2,18,0,0.37467,0.306233,0.460646,0.278865,57,0.026738,1,0,1
3,18,0,0.480497,0.43782,0.537159,0.200587,1,0.026738,2,0,1
4,18,0,0.432789,0.355315,0.526811,0.219178,54,0.0,3,0,1


In [7]:
y.head()

0    5
1    0
2    2
3    2
4    0
Name: sa_quantity, dtype: int64

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y)
reg = RandomForestRegressor().fit(X_train, y_train)

y_test_ = reg.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_test_))
print("Score: ", reg.score(X_test, y_test))



MSE:  32.73808464779537
Score:  0.8080019641551746


In [9]:
import pickle

# Save to file in the current working directory
pkl_filename = "rf_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(reg, file)

In [10]:
pred_data = pd.read_csv("input_data_pred.csv")

In [27]:
def encode_categorical_features_existing_encoders(data_df, columns, encoders):
    for feature in columns:
        le = encoders[feature]
        data_df[feature] = le.transform(data_df[feature].values)
        encoders[feature] = le

# Performs preprocessing on the data for the training set.
def preprocess_pred(df, continuous_feature_columns=[], categorical_feature_columns=[], encoders={},
                    date_column="date"):
    # Fill in empty values with placeholders.
    df = df.fillna({"price": 0, "event": "N/A", "location": 0, "date": "N/A", "sa_quantity": 0,
                    "temp_mean":  0, "temp_max": 0, "temp_min": 0, "sunshine_quant": 0})

    # Normalize numerical features
    normalize_features(df, continuous_feature_columns)
    
    # Convert date column to date attributes
    df["day"] = pd.to_datetime(df[date_column]).dt.day
    df["month"] = pd.to_datetime(df[date_column]).dt.month
    df["year"] = pd.to_datetime(df[date_column]).dt.year
    
    # Remove date_column from training set.
    df_columns = list(df.columns.values)
    df_columns.remove(date_column)
    df = df[df_columns]
    
    categorical_feature_columns.append("day")
    categorical_feature_columns.append("month")
    categorical_feature_columns.append("year")
    
    # Convert event to categorical value.
    encode_categorical_features_existing_encoders(df, categorical_feature_columns, encoders)
    
    return df

continuous_feature_columns = ["temp_mean", "temp_max", "temp_min", "sunshine_quant", "price"]
categorical_feature_columns = ["event", "location", "product"]

pred_X = preprocess_pred(pred_data, continuous_feature_columns, categorical_feature_columns, encoders)

In [28]:
pred_y = reg.predict(pred_X)

In [29]:
output_df = pred_data

In [30]:
output_df["sa_quantity"] = pred_y.astype(int)

In [25]:
'''
categorical_feature_columns = ["event", "location", "product", "day", "month", "year"]

def decode_categorical_features_existing_encoders(data_df, columns, encoders):
    for feature in columns:
        le = encoders[feature]
        data_df[feature] = le.inverse_transform(data_df[feature].values)

decode_categorical_features_existing_encoders(output_df, categorical_feature_columns, encoders)
'''

In [32]:
output_df

Unnamed: 0,location,product,date,temp_mean,temp_max,temp_min,sunshine_quant,event,price,sa_quantity
0,1193,1,2013-07-01,13.905,16.27,11.54,456.0,,1.500000,11
1,1193,1,2013-07-02,13.025,14.94,11.11,347.0,,1.500000,14
2,1193,1,2013-07-03,14.390,15.94,12.84,297.0,,1.500000,8
3,1193,1,2013-07-04,15.015,17.52,12.51,385.0,,1.500000,7
4,1193,1,2013-07-05,16.890,21.81,11.97,622.0,,1.500000,7
5,1193,1,2013-07-06,15.600,19.67,11.53,680.0,,1.500000,7
6,1193,1,2013-07-07,19.640,26.10,13.18,925.0,,1.500000,7
7,1193,1,2013-07-08,21.750,26.24,17.26,904.0,,1.500000,7
8,1193,1,2013-07-09,20.070,25.65,14.49,834.0,Ramadan Start,1.500000,5
9,1193,1,2013-07-10,20.585,25.42,15.75,793.0,,,0


In [34]:
output_df.to_csv("output_data_pred.csv")