In [1]:
from datetime import datetime ## time series analysis

import numpy as np
import pandas as pd
import plotly.express as px # for interactive visualizations
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans



In [2]:
# Load the source CSV into a DataFrame. The path is relative to the
# notebook's working directory.
CSV_PATH = "Weather Data in India from 1901 to 2017.csv"
df = pd.read_csv(CSV_PATH)

In [3]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,0,1901,17.99,19.43,23.49,26.41,28.28,28.6,27.49,26.98,26.26,25.08,21.73,18.95
1,1,1902,19.0,20.39,24.1,26.54,28.68,28.44,27.29,27.05,25.95,24.37,21.33,18.78
2,2,1903,18.32,19.79,22.46,26.03,27.93,28.41,28.04,26.63,26.34,24.57,20.96,18.29
3,3,1904,17.77,19.39,22.95,26.73,27.83,27.85,26.84,26.73,25.84,24.36,21.07,18.84
4,4,1905,17.4,17.79,21.78,24.84,28.32,28.69,27.67,27.47,26.29,26.16,22.07,18.71


In [4]:
# Discard the "Unnamed: 0" column (presumably a row index that was saved
# into the CSV -- verify against the file) so YEAR becomes the first column.
df = df.drop(columns="Unnamed: 0")

In [5]:
# Preview the first five rows again after the column drop.
df.head(n=5)

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1901,17.99,19.43,23.49,26.41,28.28,28.6,27.49,26.98,26.26,25.08,21.73,18.95
1,1902,19.0,20.39,24.1,26.54,28.68,28.44,27.29,27.05,25.95,24.37,21.33,18.78
2,1903,18.32,19.79,22.46,26.03,27.93,28.41,28.04,26.63,26.34,24.57,20.96,18.29
3,1904,17.77,19.39,22.95,26.73,27.83,27.85,26.84,26.73,25.84,24.36,21.07,18.84
4,1905,17.4,17.79,21.78,24.84,28.32,28.69,27.67,27.47,26.29,26.16,22.07,18.71


In [6]:
# Reshape the wide table (one column per month) into long format so that
# every row holds a single (YEAR, month, reading) observation.
#   id_vars    -> column repeated unchanged on every output row.
#   value_vars -> columns that are unpivoted: each column's name goes into
#                 "variable" and its cell contents go into "value".
# Every column after the first is treated as a month column; this assumes
# YEAR is the first column of df at this point.
month_columns = df.columns[1:]
df1 = df.melt(id_vars='YEAR', value_vars=month_columns)
df1.head()

Unnamed: 0,YEAR,variable,value
0,1901,JAN,17.99
1,1902,JAN,19.0
2,1903,JAN,18.32
3,1904,JAN,17.77
4,1905,JAN,17.4


In [7]:
# Build a "MON YYYY" label for every row (e.g. "JAN 1901") by joining the
# melted month name in "variable" with the year rendered as text.
date_labels = df1['variable'] + ' ' + df1['YEAR'].astype(str)

# Parse all labels in a single vectorized call instead of a per-row
# datetime.strptime via apply(). '%b %Y' means abbreviated month name
# followed by a four-digit year; the day is not given and defaults to 1.
#
# The result is assigned with df1['Date'] = ... on purpose: that replaces the
# whole column, so "Date" gets a proper datetime64 dtype. Writing through
# df1.loc[:, 'Date'] = ... on recent pandas versions sets values into the
# existing column in place, which can leave it as object dtype.
df1['Date'] = pd.to_datetime(date_labels, format='%b %Y')
df1.head()

Unnamed: 0,YEAR,variable,value,Date
0,1901,JAN,17.99,1901-01-01
1,1902,JAN,19.0,1902-01-01
2,1903,JAN,18.32,1903-01-01
3,1904,JAN,17.77,1904-01-01
4,1905,JAN,17.4,1905-01-01


## Temperature Through Time


In [8]:
# Interactive line chart of every monthly reading over the full timeline.

# Give the melted frame descriptive column names. The assignment is
# positional, so it relies on df1 having exactly these four columns in this
# order (YEAR, variable, value, Date) when the cell runs.
df1.columns = ['Year', 'Month', 'Temprature', 'Date']

# Order rows chronologically so the line is drawn left-to-right in time.
# The sorted frame is rebound to df1 rather than sorted with inplace=True.
df1 = df1.sort_values(by='Date')

# The y axis starts at 0 and ends one unit above the highest reading.
y_axis_top = df1['Temprature'].max() + 1
fig = go.Figure(layout=go.Layout(yaxis=dict(range=[0, y_axis_top])))
fig.add_trace(go.Scatter(x=df1['Date'], y=df1['Temprature']))

# Range-selector buttons shown above the plot: one resets to the whole
# series, the other narrows the view to one year ("todate" step mode).
range_buttons = [
    dict(label="Whole View", step="all"),
    dict(count=1, label="One Year View", step="year", stepmode="todate"),
]

# All layout settings are applied in one call, and the x axis is described
# with a plain dict that carries its own title. Setting the axis title first
# and then passing a go.layout.XAxis object for `xaxis` risks the object
# replacing the previously configured axis (title included) instead of being
# merged into it.
fig.update_layout(
    title='Temprature Throught Timeline:',
    yaxis_title='Temprature in Degrees',
    xaxis=dict(
        title=dict(text='Time'),
        type="date",
        rangeselector=dict(buttons=range_buttons),
        rangeslider=dict(visible=True),  # draggable zoom strip under the plot
    ),
)
fig.show()


In [10]:
# One box per month: x groups the rows by the "Month" column and y supplies
# the "Temprature" values whose spread each box summarises.
fig = px.box(df1, x='Month', y='Temprature')
fig.update_layout(title_text='Warmest, Coldest and Median Monthly Tempratue.')
fig.show()

In [11]:
# Elbow scan: fit k-means on the temperature readings for k = 1..9 and
# record each model's inertia (sum of squared distances to the nearest
# centroid) to judge how many clusters are worthwhile.
#
# NOTE(review): the saved output of this cell is
# "AttributeError: 'NoneType' object has no attribute 'split'". Nothing in
# this cell calls .split itself, so the failure comes from inside a library
# call -- possibly an outdated threadpoolctl used by scikit-learn. Verify
# the environment before relying on this cell.

# Fixed seed so centroid initialisation, and therefore the curve, is the
# same on every run.
RANDOM_STATE = 42

sse = []
# KMeans expects a 2-D array; reshape the single feature to (n_samples, 1).
target = df1['Temprature'].to_numpy().reshape(-1, 1)
num_clusters = list(range(1, 10))

for k in num_clusters:
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; 10 restarts per k keeps results comparable.
    km = KMeans(n_clusters=k, n_init=10, random_state=RANDOM_STATE)
    km.fit(target)
    sse.append(km.inertia_)

# The same series is drawn twice: once as a line, once as markers.
fig = go.Figure(data=[
    go.Scatter(x=num_clusters, y=sse, mode='lines'),
    go.Scatter(x=num_clusters, y=sse, mode='markers'),
])

fig.update_layout(
    title="Evaluation on number of clusters:",
    xaxis_title="Number of Clusters:",
    yaxis_title="Sum of Squared Distance",
    showlegend=False,
)
fig.show()




AttributeError: 'NoneType' object has no attribute 'split'