In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
import time

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Columns that are one-hot encoded (via `encode_column`) before modelling.
CATEGORICAL_COLUMNS = [
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'RainToday',
]

In [3]:
# Load the dataset from 'weatherAUS.csv'; the path is relative, so the file
# must sit in the notebook's working directory.
data = pd.read_csv('weatherAUS.csv')

In [4]:
# Fill missing numeric values with their column mean. `numeric_only=True` is
# required: the frame still holds string columns at this point (e.g. 'Date'
# and the ones listed in CATEGORICAL_COLUMNS), and pandas >= 2.0 raises a
# TypeError when asked for the mean of such columns instead of skipping them.
data = data.fillna(data.mean(numeric_only=True))

# Any NaN left over is in a non-numeric column, which the means above cannot
# fill, so those rows are removed, followed by exact duplicate rows.
# Reassigning (rather than `inplace=True`) keeps the whole cell a pure
# function of its input.
data = data.dropna().drop_duplicates()

In [5]:
# 'Date' is removed from the frame so it is not part of the feature columns.
data = data.drop(columns=['Date'])

# Binary target: 0 where the label is exactly 'No', 1 for every other value.
data['RainTomorrow'] = np.where(data['RainTomorrow'].eq('No'), 0, 1)

In [6]:
# Feature-set variants, keyed by version number:
#   2 -> every column except the target,
#   1 -> version 2 without the temperature / evaporation columns,
#   0 -> version 1 without the wind columns as well.
all_features = data.drop(columns='RainTomorrow').columns.tolist()

dropped_in_v1 = {'MinTemp', 'MaxTemp', 'Evaporation', 'Temp9am', 'Temp3pm'}
dropped_in_v0 = dropped_in_v1 | {
    'WindGustDir', 'WindGustSpeed',
    'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm',
}

features = {
    2: all_features,
    0: [name for name in all_features if name not in dropped_in_v0],
    1: [name for name in all_features if name not in dropped_in_v1],
}

In [7]:
def encode_column(data, column):
    """Replace ``column`` in ``data`` with its one-hot (dummy) columns.

    The dummy columns are named ``"<column>_<value>"`` and are appended after
    the remaining original columns. ``data`` itself is not modified; a new
    DataFrame is returned.
    """
    dummies = pd.get_dummies(data[column], prefix=column)
    remaining = data.drop(columns=column)
    return pd.concat([remaining, dummies], axis=1)

In [8]:
# Fixed seed for both the split and the forest, so that a fresh
# "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42
TARGET_COLUMN = "RainTomorrow"


def _columns_for_features(all_columns, base_features):
    """Pick the encoded-frame columns that belong to ``base_features``.

    A column belongs to a feature when it is the feature itself or one of the
    dummy columns ``encode_column`` produced for it (``"<feature>_<value>"``).
    Matching on the full ``"<feature>_"`` prefix, rather than a bare
    ``startswith(feature)``, stops a feature from capturing an unrelated
    column (in the worst case the target) whose name merely begins with the
    same characters.
    """
    wanted = set(base_features)
    dummy_prefixes = tuple(feature + "_" for feature in base_features)
    return [
        col for col in all_columns
        if col in wanted or col.startswith(dummy_prefixes)
    ]


records = []

# One-hot encode every categorical column once, before any splitting, so all
# experiments share the same encoded frame.
encoded_data = data
for category in CATEGORICAL_COLUMNS:
    encoded_data = encode_column(encoded_data, category)

for train_size in [0.2, 0.5, 0.8]:

    train, test = train_test_split(
        encoded_data,
        test_size=1 - train_size,
        random_state=RANDOM_STATE,
    )
    print(len(train), 'train examples')
    print(len(test), 'test examples')

    # Use 1-D targets: handing scikit-learn a one-column DataFrame makes every
    # fit() emit a "column-vector y was passed" DataConversionWarning.
    train_Y = train[TARGET_COLUMN]
    test_Y = test[TARGET_COLUMN]

    for features_version in [0, 1, 2]:

        print("Train size: ", train_size, "Features version: ", features_version)

        columns_X = _columns_for_features(encoded_data.columns, features[features_version])
        train_X = train[columns_X]
        test_X = test[columns_X]

        # Only construction + fitting is timed; scoring is excluded.
        time_before = time.perf_counter()
        model = RandomForestClassifier(
            n_estimators=50,
            random_state=RANDOM_STATE,
        ).fit(train_X, train_Y)
        time_taken = time.perf_counter() - time_before

        train_score = model.score(train_X, train_Y)
        test_score = model.score(test_X, test_Y)

        # Positional layout [fit time, train accuracy, test accuracy] is kept
        # because the results cell further down reads these entries by index.
        records.append([time_taken, train_score, test_score])

24742 train examples
98968 test examples
Train size:  0.2 Features version:  0




Train size:  0.2 Features version:  1




Train size:  0.2 Features version:  2




61855 train examples
61855 test examples
Train size:  0.5 Features version:  0




Train size:  0.5 Features version:  1




Train size:  0.5 Features version:  2




98968 train examples
24742 test examples
Train size:  0.8 Features version:  0




Train size:  0.8 Features version:  1




Train size:  0.8 Features version:  2




In [14]:
# Re-create the (split, feature-set) grid in the order the training loop
# visited it, so that entry k of `records` lines up with grid entry k.
experiment_grid = [
    (split, version)
    for split in [0.2, 0.5, 0.8]
    for version in [0, 1, 2]
]

results = []
for position, (split, version) in enumerate(experiment_grid):
    run = records[position]  # [fit time, train accuracy, test accuracy]
    results.append({
        'split': split,
        'features': version,
        'time': round(run[0], 3),
        'train_acc': round(run[1], 3),
        'test_acc': round(run[2], 3),
    })

In [15]:
# Turn the list of per-run dicts into a table: one row per entry of `results`,
# one column per dict key.
results_df = pd.DataFrame(results)
# Bare last expression so the notebook renders the table as the cell output.
results_df

Unnamed: 0,split,features,time,train_acc,test_acc
0,0.2,0,1.64,0.999,0.844
1,0.2,1,2.007,1.0,0.851
2,0.2,2,2.338,1.0,0.852
3,0.5,0,4.282,0.998,0.847
4,0.5,1,6.707,1.0,0.855
5,0.5,2,6.018,1.0,0.856
6,0.8,0,7.054,0.998,0.845
7,0.8,1,10.209,1.0,0.856
8,0.8,2,10.439,1.0,0.854


In [16]:
# Persist the summary table next to the notebook ...
results_df.to_csv(path_or_buf='RF-Australia.csv')

# ... and print the same table as LaTeX source for copy-pasting.
latex_table = results_df.to_latex()
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
{} &  split &  features &    time &  train\_acc &  test\_acc \\
\midrule
0 &    0.2 &         0 &   1.640 &      0.999 &     0.844 \\
1 &    0.2 &         1 &   2.007 &      1.000 &     0.851 \\
2 &    0.2 &         2 &   2.338 &      1.000 &     0.852 \\
3 &    0.5 &         0 &   4.282 &      0.998 &     0.847 \\
4 &    0.5 &         1 &   6.707 &      1.000 &     0.855 \\
5 &    0.5 &         2 &   6.018 &      1.000 &     0.856 \\
6 &    0.8 &         0 &   7.054 &      0.998 &     0.845 \\
7 &    0.8 &         1 &  10.209 &      1.000 &     0.856 \\
8 &    0.8 &         2 &  10.439 &      1.000 &     0.854 \\
\bottomrule
\end{tabular}

