In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
# 為了處理方便，把 'train.csv' 和 'test.csv' 合併起來，'test.csv'的 Weather 欄位用 0 補起來。
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Weather'] = np.zeros((len(df_test),))

# 以 train_end_idx 作為 'train.csv' 和 'test.csv' 分界列，
train_end_idx = len(df)
df = pd.concat([df, df_test], sort=False)

In [59]:
# 將非數值欄位拿掉
df_nocat = df.drop(columns = [col for col in df.columns if df[col].dtype == object])
df_cat = df[[col for col in df.columns if df[col].dtype == object if col!="Date"]]

# 將 missing value 補 0
# df = df.fillna(0)
response = df_nocat.Weather
df_nocat.drop(['Weather'], axis=1, inplace=True)

Date Object

In [20]:
df_nocat.describe();
df_cat.head()

Unnamed: 0,Loc,WindDir,DayWindDir,NightWindDir
0,,ENE,ENE,ENE
1,Canberra,E,,SE
2,Woomera,S,S,SW
3,Tuggeranong,NNE,W,
4,Hobart,N,N,WNW


Missing Values

In [60]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
imputer = imputer.fit(df_nocat.values)
df_impute = pd.DataFrame(imputer.transform(df_nocat.values), columns = df_nocat.columns)

Normalize

In [61]:
from sklearn.preprocessing import normalize

df_norm = normalize(df_impute, axis=0, norm='l2')
df_norm = pd.DataFrame(df_norm, columns = df_impute.columns)

Label Encode

In [62]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for i in range(df_cat.shape[1]):
    df_cat.iloc[:, i] = encoder.fit_transform(df_cat.iloc[:, i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a D

In [63]:
df_cat.head()

Unnamed: 0,Loc,WindDir,DayWindDir,NightWindDir
0,49,1,1,1
1,9,0,16,9
2,48,8,8,12
3,40,5,13,16
4,15,3,3,14


Combine numerical and categorical variables

In [71]:
data = np.concatenate([df_norm, df_cat], axis=1)

Split target and variables

In [72]:
from sklearn.model_selection import train_test_split

X = data[:train_end_idx, :]
y = response.values[:train_end_idx]

Smote

In [70]:
from imblearn.over_sampling import BorderlineSMOTE
smoter = BorderlineSMOTE(kind="borderline-1")
X_smote, y_smote = smoter.fit_resample(X, y)

# train

In [73]:
X_train, X_val, y_train, y_val = train_test_split(X_smote, y_smote, test_size=0.3)
X_test = data[train_end_idx:, :]

In [76]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

((44060, 20), (18884, 20), (44060,), (18884,), (34844, 20))

In [79]:
X_all = np.concatenate((X_train, X_val), axis=0)
y_all = np.concatenate((y_train, y_val), axis=0)
X_all.shape, y_all.shape

((62944, 20), (62944,))

### Decision Tree

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

#train tree model
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

#predict
y_pred_decision = model.predict(X_val)
print('Accuracy: %f' % accuracy_score(y_val, y_pred_decision))
print('f1-score: %f' % f1_score(y_val, y_pred_decision))

Accuracy: 0.869201
f1-score: 0.870287


### SVM

In [56]:
from sklearn.svm import SVC
model = SVC(kernel="linear", C=2)
model.fit(X_train, y_train)
#predict
y_pred_decision = model.predict(X_val)
print('Accuracy: %f' % accuracy_score(y_val, y_pred_decision))
print('f1-score: %f' % f1_score(y_val, y_pred_decision))

Accuracy: 0.566776
f1-score: 0.257420


### Random Forest

In [81]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

#predict
y_pred_decision = model.predict(X_val)
print('Accuracy: %f' % accuracy_score(y_val, y_pred_decision))
print('f1-score: %f' % f1_score(y_val, y_pred_decision))

Accuracy: 0.928140
f1-score: 0.927692


### Deep Learning

In [82]:
ans_pred = model.predict(X_test)
df_sap = pd.DataFrame(ans_pred.astype(int), columns = ['Weather'])
df_sap.to_csv('myAns.csv',  index_label = 'Id')