# I. Exploration

## 1. Data Loading and Visualisation

First, we load some useful Python libraries for data loading and data visualization.

In [None]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time

from math import *

In [None]:
def Load_DataSet(name, data_location = "data/"):
    """Read a space-separated text file into a DataFrame.

    The file path is the plain concatenation ``data_location + name + ".txt"``,
    so ``data_location`` must already end with a path separator.

    Parameters
    ----------
    name : str
        File name without the ``.txt`` extension.
    data_location : str
        Directory prefix prepended to ``name``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    file_path = "".join((data_location, name, ".txt"))
    table = pd.read_csv(file_path, sep=" ")
    return table

In [None]:
# Where the raw files live: `path` is an optional prefix in front of `data_location`
path = ""
data_location = "data/"
# Read <path><data_location>rain_project.txt (space-separated)
rain = Load_DataSet(name="rain_project", data_location="".join((path, data_location)))
# Preview the first rows (shown as the cell output)
rain.head()

**Remark**:

In this data set, the qualitative variables are "Id", "date" and "rain_class".

The other variables are quantitative: "ff", "t", "td", "hu", "dd", "precip", "ws_arome", "p3031_arome", "u10_arome", "v10_arome", "t2m_arome", "d2m_arome", "r_arome", "tp_arome", "msl_arome" and "rain".

## Data transformation

### Date to month

In [None]:
# Turn the "date" column into a "month" column (month number of each date),
# keeping the column at its original position in the frame.
from datetime import datetime, timedelta

month_of_date = pd.to_datetime(rain["date"]).dt.month
rain = rain.assign(date=month_of_date).rename(columns={"date": "month"})

In [None]:
# Split the column names by position: the first two columns and the last one
# go to `qual_var`, everything in between goes to `num_var`.
names = rain.columns.tolist()
qual_var = [names[0], names[1], names[-1]]
num_var = names[2:-1]

### Logarithm transformation ($\log(\cdot + 1)$)

In [None]:
# Work on a copy so that the untransformed `rain` frame is left intact.
rain_log = rain.copy()

# log(1 + x) transform. np.log1p is the dedicated routine for this and stays
# accurate for values close to zero, unlike np.log(x + 1).
rain_log["precip"] = np.log1p(rain_log["precip"])
rain_log["tp_arome"] = np.log1p(rain_log["tp_arome"])
# The original "rain" column is kept; its transform is stored next to it.
rain_log["rain_log"] = np.log1p(rain_log["rain"])

# Rename the two columns that were transformed in place.
log_renames = {'precip': 'precip_log', 'tp_arome': 'tp_arome_log'}
rain_log.rename(columns = log_renames, inplace = True)

# Mirror the renaming in the list of quantitative variables. An exact-name
# lookup is used so that a column merely containing "precip" or "tp_arome"
# as a substring is never rewritten into a name that does not exist.
num_var_log = [log_renames.get(item, item) for item in num_var]

# NOTE(review): "rain_log" is a continuous transform yet it is appended to the
# qualitative list - confirm this is intended.
qual_var_log = qual_var + ["rain_log"]

## Rain data set presentation

## Brief description of the data sets

In [None]:
# Summary statistics of the original data set
rain_summary = rain.describe()
print(rain_summary)

In [None]:
# Summary statistics of the log-transformed data set
rain_log_summary = rain_log.describe()
print(rain_log_summary)

### Qualitative variable
#### Histogram of `month` variable

In [None]:
var = "month"

# 2*12-1 = 23 bins; presumably chosen so that each of the 12 months falls in
# its own bin with an empty bin in between - TODO confirm.
n_bins = 2 * 12 - 1

plt.figure()
plt.hist(rain[var], bins=n_bins)
plt.title("Histogram of {}".format(var))
plt.xlabel("{} values".format(var))
plt.ylabel("Number per interval")
plt.show()

### Quantitative variables
#### Histograms

In [None]:
# One 50-bin histogram (in its own figure) per quantitative variable
for var in num_var:
    values = rain[var]
    plt.figure()
    plt.hist(values, bins=50)
    plt.title("Histogram of {}".format(var))
    plt.xlabel("{} values".format(var))
    plt.ylabel("Number per interval")
# Render all the figures created above
plt.show()

In [None]:
# Same histograms for the three log-transformed columns
log_columns = ("precip_log", "tp_arome_log", "rain_log")
for var in log_columns:
    values = rain_log[var]
    plt.figure()
    plt.hist(values, bins=50)
    plt.title("Histogram of {}".format(var))
    plt.xlabel("{} values".format(var))
    plt.ylabel("Number per interval")
# Render all the figures created above
plt.show()

#### Correlation between variables
##### Classical data set

In [None]:
# Pairwise scatter plots of the quantitative variables
quantitative_frame = rain[num_var]
pd.plotting.scatter_matrix(quantitative_frame, figsize=(12, 12))
plt.show()

In [None]:
# Pairwise correlations between the quantitative variables
rain_corr = rain[num_var].corr()

# Boolean mask hiding the upper triangle and the diagonal: the matrix is
# symmetric, so only the lower triangle carries information.
mask = np.triu(np.ones_like(rain_corr, dtype=bool))
f, ax = plt.subplots(figsize=(12, 14))
cmap = 'coolwarm'
# NOTE(review): the colour scale is clipped to [-0.3, 0.3], so stronger
# correlations saturate (the annotated numbers remain exact) - confirm intent.
sns.heatmap(rain_corr, mask=mask, cmap=cmap, annot=True, vmax=.3, vmin=-.3,
            center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5},
            ax=ax)
# Render the figure, as the other plotting cells do.
plt.show()

##### Log-transformed data set

In [None]:
# Pairwise scatter plots of the quantitative variables after the log transform
quantitative_log_frame = rain_log[num_var_log]
pd.plotting.scatter_matrix(quantitative_log_frame, figsize=(12, 12))
plt.show()

In [None]:
# Pairwise correlations between the quantitative variables of the
# log-transformed data set
rain_log_corr = rain_log[num_var_log].corr()

# Boolean mask hiding the upper triangle and the diagonal: the matrix is
# symmetric, so only the lower triangle carries information.
mask = np.triu(np.ones_like(rain_log_corr, dtype=bool))
f, ax = plt.subplots(figsize=(12, 14))
cmap = 'coolwarm'
# NOTE(review): the colour scale is clipped to [-0.3, 0.3], so stronger
# correlations saturate (the annotated numbers remain exact) - confirm intent.
sns.heatmap(rain_log_corr, mask=mask, cmap=cmap, annot=True, vmax=.3, vmin=-.3,
            center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5},
            ax=ax)
# Render the figure, as the other plotting cells do.
plt.show()

## PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [None]:
# PCA on the quantitative variables, standardised with sklearn's `scale`.
pcaR = PCA()
# Despite its name, `loadingR` holds the scaled input data, not PCA loadings.
scaled_values = scale(rain[num_var])
loadingR = pd.DataFrame(scaled_values, columns=num_var)
# Fit first, then project the same observations onto the principal axes.
pcaR.fit(loadingR)
pca_DataSet = pcaR.transform(loadingR)

In [None]:
# Share of the variance (in %) explained by each principal component
explained_pct = pcaR.explained_variance_ratio_ * 100
x = np.arange(explained_pct.size)
plt.figure(figsize=(10, 5))
plt.bar(x, explained_pct)
plt.xlabel('Number of components')
plt.ylabel('Explained variance (%)')
plt.show()

In [None]:
# Cumulative explained variance (in %), with a red reference line at 95 %
cumulative_pct = pcaR.explained_variance_ratio_.cumsum() * 100
x = np.arange(pcaR.explained_variance_ratio_.size)
threshold_line = np.full(x.shape, 95.0)
plt.figure(figsize=(10, 5))
plt.bar(x, cumulative_pct)
plt.plot(x, threshold_line, color="red")
plt.xlabel('Number of components')
plt.ylabel('Cumulative summation of explained variance (%)')
plt.show()

In [None]:
# Number of leading principal components shown in the box plot below.
# NOTE(review): presumably read off the cumulative explained-variance plot
# (95 % line) - confirm the value against that plot.
nb_PCA_components = 7

In [None]:
# Wrap the PCA scores in a DataFrame; the columns are labelled 0, 1, 2, ...
pca_DataSet = pd.DataFrame(data=pca_DataSet)
# Attach the class label as a categorical column (aligned on the row index)
rain_class_labels = rain["rain_class"].astype("category")
pca_DataSet["rain_class"] = rain_class_labels

In [None]:
# Box plots of the scores on the first `nb_PCA_components` components
leading_components = pca_DataSet.iloc[:, :nb_PCA_components]
leading_components.plot.box(figsize=(15, 6))
plt.xlabel('First %d-th principal components' % nb_PCA_components)
plt.show()

In [None]:
# Observations projected on the first two components, coloured by rain class
pca_DataSet.plot(kind="scatter", x=0, y=1, c="rain_class", cmap="viridis",
                 figsize=(10, 10))
plt.title('Individuals factor map - PCA')
plt.show()

In [None]:
# Coordinates of each variable on the first two axes: the component vector
# scaled by the square root of the variance explained by that component.
coord1 = np.sqrt(pcaR.explained_variance_[0]) * pcaR.components_[0]
coord2 = np.sqrt(pcaR.explained_variance_[1]) * pcaR.components_[1]
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
# One labelled arrow from the origin per variable
for x_tip, y_tip, label in zip(coord1, coord2, loadingR.columns):
    ax.text(x_tip, y_tip, label)
    ax.arrow(0, 0, x_tip, y_tip, color='r', width=0.0001)
ax.axis((-1, 1, -1, 1))
# Unit circle
c = plt.Circle((0, 0), radius=1, color='b', fill=False)
ax.add_patch(c)
ax.set_title('Variables factor map - PCA')
plt.show()

## Data preparation

### Month class to dummy variables

In [None]:
# One-hot encode the "month" column of both data sets.
# NOTE(review): `rain_class` and `rain_log_class` are not used again in the
# visible part of the notebook (the split below works on `rain`) - confirm
# whether they should feed the later steps.
dummy_columns = ['month']
rain_class = pd.get_dummies(rain, columns=dummy_columns)
rain_log_class = pd.get_dummies(rain_log, columns=dummy_columns)

### Data normalization

In [None]:
# Standardise each quantitative column in place: subtract its mean and divide
# by its standard deviation, first for `rain`, then for `rain_log`.
# NOTE(review): the statistics are computed on the full data set, i.e. before
# the train/test split below - confirm this is acceptable for the modelling.
for frame, columns in ((rain, num_var), (rain_log, num_var_log)):
    for var in columns:
        column = frame[var]
        frame[var] = (column - column.mean()) / column.std()

### Splitting the data into a training set and a test set

In [None]:
def SplitingData(train_set_rate = .8, random = False, data = None, random_state = None):
    """Split a data set into a training part and a test part.

    Parameters
    ----------
    train_set_rate : float
        Fraction of the rows assigned to the training set.
    random : bool
        If False, the split follows row order: the first
        ``ceil(train_set_rate * n_rows)`` rows form the training set and the
        remaining rows form the test set. If True, the training rows are
        sampled at random and the test set is made of the rows whose index
        labels were not drawn.
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``rain`` frame.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the random split
        reproducible; only used when ``random`` is True.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    if data is None:
        # Backward-compatible default: operate on the global `rain` frame.
        data = rain

    if random:
        train = data.sample(frac = train_set_rate, random_state = random_state)
        test = data.drop(train.index)
    else:
        limit_train_test_set = ceil(train_set_rate * data.shape[0])
        # Rows before the limit go to the training set, rows from the limit
        # onwards go to the test set.
        train = data.iloc[:limit_train_test_set, :]
        test = data.iloc[limit_train_test_set:, :]
    return (train, test)

# Share of the rows assigned to the training set (80 %)
train_set_rate = .8
# Ordered (non-random) split of the data set
rain_train, rain_test = SplitingData(train_set_rate=train_set_rate, random=False)

In [None]:
# First rows and summary statistics of the training set
for report in (rain_train.head(), rain_train.describe()):
    print(report)

In [None]:
# First rows and summary statistics of the test set
for report in (rain_test.head(), rain_test.describe()):
    print(report)

## Saving the data sets

In [None]:
def Save_DataSet(df, name, data_location = "data/"):
    """Write a DataFrame to a space-separated text file.

    The target path is the plain concatenation
    ``data_location + name + ".txt"``, so ``data_location`` must already end
    with a path separator. The row index is written as well (pandas default).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    name : str
        File name without the ``.txt`` extension.
    data_location : str
        Directory prefix prepended to ``name``.
    """
    target_path = "".join((data_location, name, ".txt"))
    df.to_csv(target_path, sep=" ")

In [None]:
# Persist both subsets under the default data directory
for subset, file_stem in ((rain_train, "train_set"), (rain_test, "test_set")):
    Save_DataSet(subset, file_stem)