# Project : Work with 65 years dataset for Rainfall Analysis in Bangladesh.

In [2]:
# importing the necessary libraries

import pandas as pd
import gc
import numpy as np
from sklearn.preprocessing import LabelEncoder , MinMaxScaler
from sklearn.model_selection import train_test_split , GridSearchCV , cross_val_score , KFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from xgboost import plot_importance

In [4]:
# load the dataset

df = pd.read_csv('65 Years of Weather Data Bangladesh (1948 - 2013).csv')
print('Dataset Size is {0} MB'.format(df.memory_usage().sum()/1024**2))

Dataset Size is 2.7393798828125 MB


In [5]:
df.isnull().sum()

Station Names        0
YEAR                 0
Month                0
Max Temp             0
Min Temp             0
Rainfall             0
Relative Humidity    0
Wind Speed           0
Cloud Coverage       0
Bright Sunshine      0
Station Number       0
X_COR                0
Y_COR                0
LATITUDE             0
LONGITUDE            0
ALT                  0
Period               0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21120 entries, 0 to 21119
Data columns (total 17 columns):
Station Names        21120 non-null object
YEAR                 21120 non-null int64
Month                21120 non-null int64
Max Temp             21120 non-null float64
Min Temp             21120 non-null float64
Rainfall             21120 non-null float64
Relative Humidity    21120 non-null float64
Wind Speed           21120 non-null float64
Cloud Coverage       21120 non-null float64
Bright Sunshine      21120 non-null float64
Station Number       21120 non-null int64
X_COR                21120 non-null float64
Y_COR                21120 non-null float64
LATITUDE             21120 non-null float64
LONGITUDE            21120 non-null float64
ALT                  21120 non-null int64
Period               21120 non-null float64
dtypes: float64(12), int64(4), object(1)
memory usage: 2.7+ MB


In [8]:
df.head().transpose()

Unnamed: 0,0,1,2,3,4
Station Names,Barisal,Barisal,Barisal,Barisal,Barisal
YEAR,1949,1950,1951,1952,1953
Month,1,1,1,1,1
Max Temp,29.4,30,28.2,26.6,30
Min Temp,12.3,14.1,12.3,12.3,13.3
Rainfall,0,0,0,2,10
Relative Humidity,68,77,77,77,75
Wind Speed,0.453704,0.453704,0.453704,0.453704,0.453704
Cloud Coverage,0.6,0.8,0.6,1,1.6
Bright Sunshine,7.83191,7.83191,7.83191,7.83191,7.83191


In [9]:
# # delete some columns
# new_df = df.drop(['Max Temp' , 'Min Temp' , 'Bright Sunshine' , 'Period'] , axis = 1)
# print('Dataset Size is {0} MB'.format(new_df.memory_usage().sum()/1024**2)

In [10]:
df.columns

Index(['Station Names', 'YEAR', 'Month', 'Max Temp', 'Min Temp', 'Rainfall',
       'Relative Humidity', 'Wind Speed', 'Cloud Coverage', 'Bright Sunshine',
       'Station Number', 'X_COR', 'Y_COR', 'LATITUDE', 'LONGITUDE', 'ALT',
       'Period'],
      dtype='object')

In [13]:
labelencoder_X = LabelEncoder()
df['Station Names']= labelencoder_X.fit_transform(df['Station Names'])

In [14]:
df.head()

Unnamed: 0,Station Names,YEAR,Month,Max Temp,Min Temp,Rainfall,Relative Humidity,Wind Speed,Cloud Coverage,Bright Sunshine,Station Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period
0,0,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.7,90.36,4,1949.01
1,0,1950,1,30.0,14.1,0.0,77.0,0.453704,0.8,7.831915,41950,536809.8,510151.9,22.7,90.36,4,1950.01
2,0,1951,1,28.2,12.3,0.0,77.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.7,90.36,4,1951.01
3,0,1952,1,26.6,12.3,2.0,77.0,0.453704,1.0,7.831915,41950,536809.8,510151.9,22.7,90.36,4,1952.01
4,0,1953,1,30.0,13.3,10.0,75.0,0.453704,1.6,7.831915,41950,536809.8,510151.9,22.7,90.36,4,1953.01


#### Divided the dataset into train and test.

In [16]:
df_train, df_test = train_test_split(df , train_size = 0.75, test_size = 0.25, random_state = 80)

In [17]:
df_train

Unnamed: 0,Station Names,YEAR,Month,Max Temp,Min Temp,Rainfall,Relative Humidity,Wind Speed,Cloud Coverage,Bright Sunshine,Station Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period
17095,27,1997,7,32.7,24.1,983.0,92.000000,2.300000,6.6,3.300000,41964,650012.1,488627.9,22.50,91.46,6,1997.07
20004,32,1956,7,35.0,24.6,872.0,87.245614,1.512963,7.1,3.719231,41891,694533.2,752277.9,24.88,91.93,35,1956.07
14671,23,1995,10,34.5,24.8,289.0,85.000000,0.600000,3.8,6.100000,41960,534986.1,472575.7,22.36,90.34,3,1995.10
925,1,1967,4,36.4,22.7,80.0,78.000000,3.100000,3.0,6.948485,41951,567637.6,510271.8,22.70,90.66,5,1967.04
17325,27,1987,12,29.0,16.6,22.0,77.000000,0.700000,1.3,8.000000,41964,650012.1,488627.9,22.50,91.46,6,1987.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8714,13,1980,10,32.6,24.4,387.0,83.000000,1.300000,2.7,6.493103,41963,616159.2,465295.1,22.29,91.13,4,1980.10
2259,3,1975,3,36.1,19.6,1.0,70.000000,1.000000,0.8,6.889189,41941,568556.9,571945.8,23.26,90.67,7,1975.03
19366,30,2000,12,29.0,11.0,0.0,80.000000,0.300000,1.2,9.500000,41915,675761.6,687095.9,24.29,91.73,23,2000.12
5308,8,1994,10,34.0,24.0,88.0,79.000000,0.800000,2.9,6.900000,41992,705183.0,374324.6,21.46,91.98,4,1994.10


In [18]:
mm_scaler = MinMaxScaler()
features_names = ['Station Names' , 'YEAR' , 'Month' , 'Max Temp' , 'Min Temp' , 'Rainfall' , 'Relative Humidity' , 'Wind Speed' , 'Cloud Coverage' , 'Station Number' , 'X_COR' , 'Y_COR' , 'LATITUDE' , 'LONGITUDE' , 'ALT']
features_names

['Station Names',
 'YEAR',
 'Month',
 'Max Temp',
 'Min Temp',
 'Rainfall',
 'Relative Humidity',
 'Wind Speed',
 'Cloud Coverage',
 'Station Number',
 'X_COR',
 'Y_COR',
 'LATITUDE',
 'LONGITUDE',
 'ALT']

In [21]:
df_train[features_names] = mm_scaler.fit_transform(df_train[features_names])
df_test[features_names] = mm_scaler.fit_transform(df_test[features_names])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pyda

In [22]:
x_train = df_train
y_train = df_train.pop('Rainfall')


x_test = df_test
y_test = df_test.pop('Rainfall')