In [None]:
# importing the required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics

In [None]:
# Load the rainfall dataset from "rainfall.csv" (path is relative to the
# notebook's working directory) and preview the first rows.
data = pd.read_csv("rainfall.csv")
data.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame
data.info()

In [None]:
# Count of missing values in each column
data.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
data.duplicated().sum()

In [None]:
# How many rows each subdivision contributes
data["SUBDIVISION"].value_counts()

In [None]:
# Names of all numeric columns and the mean of each (NaNs are skipped by mean())
numeric_cols = data.select_dtypes(include="number").columns
means = data.loc[:, numeric_cols].mean()
means

In [None]:
# Impute missing numeric entries with the column means computed earlier
imputed = data[numeric_cols].fillna(value=means)
data[numeric_cols] = imputed
data.head(n=3)

In [None]:
# Per column: is any value still missing?
data.isna().any()

In [None]:
# Distinct values of the YEAR column, in order of first appearance
data["YEAR"].unique()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
data.describe()

In [None]:
# (rows, columns) of the dataset
data.shape

In [None]:
# Total of the ANNUAL column per subdivision, largest total first
annual_totals = (
    data.groupby("SUBDIVISION")[["ANNUAL"]]
    .sum()
    .sort_values("ANNUAL", ascending=False)
)
ax = annual_totals.plot.barh(stacked=True, figsize=(15, 10))

ax.set_xlabel("Rainfall in MM", size=12)
ax.set_ylabel("Sub-Division", size=12)
ax.set_title("Annual Rainfall v/s SubDivisions")
ax.grid(axis="x", linestyle="-.")
plt.show()

In [None]:
# ANNUAL rainfall summed over all subdivisions, one value per year.
# The column is selected *before* aggregating so groupby does not also sum
# every other column (including the non-numeric SUBDIVISION labels) only to
# throw the result away.
yearly_total = data.groupby("YEAR")["ANNUAL"].sum()

plt.figure(figsize=(15, 8))
yearly_total.plot(kind="line", color="r", marker=".")

plt.xlabel("YEARS", size=12)
plt.ylabel("RAINFALL IN MM", size=12)
plt.grid(axis="both", linestyle="-.")
plt.title("Rainfall over Years")
plt.show()

In [None]:
# One line per calendar month: rainfall summed over all subdivisions, by year
month_cols = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
              "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
monthly_by_year = data.groupby("YEAR")[month_cols].sum()
ax = monthly_by_year.plot.line(figsize=(18, 8))

ax.set_xlabel("Year", size=13)
ax.set_ylabel("Rainfall in MM", size=13)
ax.set_title("Year v/s Rainfall in each month", size=20)
plt.show()

In [None]:
# Same view as the monthly chart, but using the four seasonal columns
season_cols = ["Jan-Feb", "Mar-May", "Jun-Sep", "Oct-Dec"]
seasonal_by_year = data.groupby("YEAR")[season_cols].sum()
ax = seasonal_by_year.plot(figsize=(10, 7))

ax.set_xlabel("Year", size=13)
ax.set_ylabel("Rainfall in MM", size=13)
plt.show()

In [None]:
# Stacked horizontal bars: each subdivision's total rainfall split by month
month_cols = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
              "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
monthly_by_subdivision = data.groupby("SUBDIVISION")[month_cols].sum()
ax = monthly_by_subdivision.plot.barh(stacked=True, figsize=(13, 8))

ax.set_title("Sub-Division v/s Rainfall in each month")
ax.set_xlabel("Rainfall in MM", size=12)
ax.set_ylabel("Sub-Division", size=12)
ax.grid(axis="x", linestyle="-.")
plt.show()

In [None]:
# Stacked horizontal bars: each subdivision's total rainfall split by season
season_cols = ["Jan-Feb", "Mar-May", "Jun-Sep", "Oct-Dec"]
seasonal_by_subdivision = data.groupby("SUBDIVISION")[season_cols].sum()
ax = seasonal_by_subdivision.plot.barh(stacked=True, figsize=(16, 8))

ax.set_xlabel("Rainfall in MM", size=12)
ax.set_ylabel("Sub-Division", size=12)
ax.grid(axis="x", linestyle="-.")
plt.show()

In [None]:
# Restrict the analysis to the rows of the "GANGETIC WEST BENGAL" subdivision
is_west_bengal = data["SUBDIVISION"].eq("GANGETIC WEST BENGAL")
WestBengal = data[is_west_bengal]
WestBengal.head()

In [None]:
# Mean rainfall of each calendar month over all West Bengal rows
month_cols = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
              "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
monthly_mean = WestBengal[month_cols].mean()

fig, ax = plt.subplots(figsize=(10, 6))
monthly_mean.plot.bar(ax=ax, width=0.5, linewidth=2)
ax.set_title("West Bengal Rainfall v/s Months", size=20)
ax.set_xlabel("Months", size=14)
ax.set_ylabel("Rainfall in MM", size=14)
ax.grid(axis="both", linestyle="-.")
plt.show()

In [None]:
# Mean of the ANNUAL column per year for the West Bengal subset
annual_mean = WestBengal.groupby("YEAR")["ANNUAL"].mean()

# No hard-coded ylim: a fixed (50, 1500) window silently cuts off every year
# whose value lies outside it, so matplotlib is left to fit the axis to the data.
ax = annual_mean.plot(
    color="r", marker="o", linestyle="-", linewidth=2, figsize=(12, 8)
)
ax.set_xlabel("Year", size=14)
ax.set_ylabel("Rainfall in MM", size=14)

# Take the year span from the data instead of typing it into the title, so the
# caption stays correct if the input file covers a different period.
first_year = int(annual_mean.index.min())
last_year = int(annual_mean.index.max())
ax.set_title(
    f"West Bengal Annual Rainfall from Year {first_year} to {last_year}", size=20
)
ax.grid()
plt.show()

In [None]:
# Pairwise correlation between the twelve monthly columns and ANNUAL
corr_cols = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL",
             "AUG", "SEP", "OCT", "NOV", "DEC", "ANNUAL"]
corr_matrix = data[corr_cols].corr()

plt.figure(figsize=(15, 6))
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Number of distinct subdivisions in the dataset
data["SUBDIVISION"].nunique()

In [None]:
# Keep only YEAR plus the twelve monthly columns for one subdivision.
# NOTE(review): this rebinds `data` from the full dataset to a single-subdivision
# frame without the SUBDIVISION column, so this cell (and any earlier cell that
# reads SUBDIVISION) cannot be re-run without reloading the CSV first.
monthly_cols = ["YEAR", "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
group = data.groupby("SUBDIVISION")[monthly_cols]
data = group.get_group("GANGETIC WEST BENGAL")
data.head()

In [None]:
# Wide -> long: one row per (YEAR, month column); the old row labels are kept
# as an "index" column by reset_index()
df = data.melt(id_vars=["YEAR"]).reset_index()
df.head()

In [None]:
# Drop the stale "index" column, regenerate it from the current row labels and
# order the rows by year, then by their position in the melted frame
df = (
    df.loc[:, ["YEAR", "variable", "value"]]
    .reset_index()
    .sort_values(["YEAR", "index"])
)
df.head()

In [None]:
# Distinct years present in the long-format frame
df["YEAR"].unique()

In [None]:
# Give the four columns readable names (assigned positionally)
df = df.set_axis(["Index", "Year", "Month", "Avg_Rainfall"], axis=1)
df.head()

In [None]:
# Month abbreviation -> calendar number (JAN=1 ... DEC=12)
Month_map = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
         "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"],
        start=1,
    )
}
# Replace the month labels with their numbers; labels missing from the map become NaN
df["Month"] = df["Month"].map(Month_map)
df.head(12)

In [None]:
# The helper "Index" column is no longer needed once the rows are ordered
df = df.drop(columns=["Index"])
df.head(2)

In [None]:
# Total rainfall per year for the selected subdivision.
# Only Avg_Rainfall is aggregated: summing the whole frame would also add up the
# numeric Month codes and draw them as a second, meaningless line.
ax = df.groupby("Year")["Avg_Rainfall"].sum().plot()
ax.set_xlabel("Year")
ax.set_ylabel("Rainfall in MM")
ax.set_title("Total rainfall per year")
plt.show()

In [None]:
# Feature matrix: (Year, Month) pairs; target vector: the rainfall value.
X = df[["Year", "Month"]].to_numpy().astype("int")
# NOTE(review): casting the target to int drops any fractional part of the
# rainfall values -- confirm this loss of precision is intended.
y = df["Avg_Rainfall"].to_numpy().astype("int")
print(X.shape, y.shape, sep="\n")

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on (Year, Month) -> rainfall; fit() returns the estimator
LR = LinearRegression().fit(X_train, y_train)
LR

In [None]:
# Predictions of the linear model on both splits
y_train_predict, y_test_predict = LR.predict(X_train), LR.predict(X_test)

In [None]:
# Error metrics of the linear model, test split first, then train split.
for header, y_true, y_pred in (
    ("-------Test Data--------", y_test, y_test_predict),
    ("\n-------Train Data--------", y_train, y_train_predict),
):
    mse = metrics.mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    print(header)
    print("MAE:", metrics.mean_absolute_error(y_true, y_pred))
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))

# LinearRegression.score() is the R^2 coefficient of determination, printed here
# as a percentage. Scale first and round afterwards: round(x, 3) * 100 leaks
# binary-float noise such as 28.899999999999995 into the output.
print("\n-----Training Accuracy-------")
print(round(LR.score(X_train, y_train) * 100, 1))
print("-----Testing Accuracy--------")
print(round(LR.score(X_test, y_test) * 100, 1))

## Lasso Model

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Base estimator; the high iteration cap is passed through to coordinate descent
lasso = Lasso(max_iter=100000)

# Candidate regularisation strengths, compared by 5-fold CV on negative MSE
alpha_candidates = [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7]
parameter = {"alpha": alpha_candidates}
lasso_regressor = GridSearchCV(
    estimator=lasso,
    param_grid=parameter,
    scoring="neg_mean_squared_error",
    cv=5,
)

lasso_regressor.fit(X_train, y_train)

In [None]:
# Show the estimator (with its alpha) that the grid search selected
print("Best Parameter for Lasso:", lasso_regressor.best_estimator_)

In [None]:
# Rebuild the Lasso model with the alpha the grid search actually selected.
# A hand-copied constant goes stale as soon as the data, the split or the
# alpha grid changes, and the search result would then be silently ignored.
best_lasso_alpha = lasso_regressor.best_params_["alpha"]
lasso = Lasso(alpha=best_lasso_alpha, max_iter=100000)
# fit into the object
lasso.fit(X_train, y_train)

In [None]:
# Predictions of the Lasso model on the train and test splits
y_train_predict, y_test_predict = (
    lasso.predict(features) for features in (X_train, X_test)
)

In [None]:
from sklearn import metrics

# Error metrics of the Lasso model, test split first, then train split.
for header, y_true, y_pred in (
    ("-------Test Data--------", y_test, y_test_predict),
    ("\n-------Train Data--------", y_train, y_train_predict),
):
    mse = metrics.mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    print(header)
    print('MAE:', metrics.mean_absolute_error(y_true, y_pred))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))

# Lasso.score() is the R^2 coefficient of determination, printed as a percentage.
# Scale first and round afterwards so float noise (e.g. 28.899999999999995)
# does not show up in the output.
print("\n-----Training Accuracy-------")
print(round(lasso.score(X_train, y_train) * 100, 1))
print("-----Testing Accuracy--------")
print(round(lasso.score(X_test, y_test) * 100, 1))

## Ridge Model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Grid-search the Ridge penalty strength with 5-fold CV on negative MSE
ridge = Ridge()
alpha_candidates = [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]
parameters = {"alpha": alpha_candidates}
ridge_regressor = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)
ridge_regressor.fit(X_train, y_train)

# Winning alpha, its mean CV score, and the refit estimator
print(ridge_regressor.best_params_, ridge_regressor.best_score_, sep="\n")
print("Best Parameter for Ridge:", ridge_regressor.best_estimator_)

In [None]:
# Rebuild the Ridge model with the alpha the grid search actually selected
# instead of a hand-copied constant that goes stale when the data, the split
# or the alpha grid changes.
best_ridge_alpha = ridge_regressor.best_params_["alpha"]
ridge = Ridge(alpha=best_ridge_alpha)

# fit into the object
ridge.fit(X_train, y_train)

In [None]:
# Ridge predictions for the train and test splits
y_train_predict, y_test_predict = map(ridge.predict, (X_train, X_test))

In [None]:
from sklearn import metrics

# Error metrics of the Ridge model, test split first, then train split.
for header, y_true, y_pred in (
    ("-------Test Data--------", y_test, y_test_predict),
    ("\n-------Train Data--------", y_train, y_train_predict),
):
    mse = metrics.mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    print(header)
    print("MAE:", metrics.mean_absolute_error(y_true, y_pred))
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))

# Ridge.score() is the R^2 coefficient of determination, printed as a percentage.
# Scale first and round afterwards so float noise (e.g. 28.899999999999995)
# does not show up in the output.
print("\n-----Training Accuracy-------")
print(round(ridge.score(X_train, y_train) * 100, 1))
print("-----Testing Accuracy--------")
print(round(ridge.score(X_test, y_test) * 100, 1))

## SVM Model

In [None]:
from sklearn import preprocessing, svm

# Rainfall is a continuous target, so this must be the support-vector
# *regressor*. SVC is a classifier: it treats every distinct target value as a
# separate class label, and its score() reports classification accuracy
# rather than R^2.
svm_regr = svm.SVR(kernel="rbf")
svm_regr.fit(X_train, y_train)

y_test_predict = svm_regr.predict(X_test)
y_train_predict = svm_regr.predict(X_train)

In [None]:
from sklearn import metrics

# Error metrics of the SVM model, test split first, then train split.
for header, y_true, y_pred in (
    ("-------Test Data--------", y_test, y_test_predict),
    ("\n-------Train Data--------", y_train, y_train_predict),
):
    mse = metrics.mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    print(header)
    print('MAE:', metrics.mean_absolute_error(y_true, y_pred))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))


# The estimator's score() is printed as a percentage. Scale first and round
# afterwards: round(x, 3) * 100 leaks binary-float noise such as
# 28.899999999999995 into the output.
print("\n-----Training Accuracy-------")
print(round(svm_regr.score(X_train, y_train) * 100, 1))
print("-----Testing Accuracy--------")
print(round(svm_regr.score(X_test, y_test) * 100, 1))

## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on (Year, Month) -> rainfall.
# random_state pins the bootstrap samples and feature sub-sampling so that
# Restart & Run All reproduces the same metrics and predictions; the value
# matches the seed already used for the train/test split.
random_forest_model = RandomForestRegressor(
    max_depth=100,
    max_features="sqrt",
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=800,
    random_state=10,
)
random_forest_model.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the train and test splits
y_train_predict, y_test_predict = (
    random_forest_model.predict(X_train),
    random_forest_model.predict(X_test),
)

In [None]:
# MAE / MSE / RMSE of the random forest, test split first, then train split
for header, y_true, y_pred in (
    ("-------Test Data--------", y_test, y_test_predict),
    ("\n-------Train Data--------", y_train, y_train_predict),
):
    mse = metrics.mean_squared_error(y_true, y_pred)
    print(header)
    print("MAE:", metrics.mean_absolute_error(y_true, y_pred))
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))

In [None]:
# RandomForestRegressor.score() is the R^2 coefficient of determination, shown
# here as a percentage. Scale first and round afterwards: round(x, 3) * 100
# leaks binary-float noise such as 28.899999999999995 into the output.
print("-----------Training Accuracy------------")
print(round(random_forest_model.score(X_train, y_train) * 100, 1))
print("-----------Testing Accuracy------------")
print(round(random_forest_model.score(X_test, y_test) * 100, 1))

In [None]:
# Single forecast; feature order matches the training matrix: [Year, Month]
query_year, query_month = 2016, 11
predicted = random_forest_model.predict([[query_year, query_month]])
predicted