In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

In [None]:
# Load the daily city temperature dataset and preview the first rows.
city_temperature_csv = '/kaggle/input/daily-temperature-of-major-cities/city_temperature.csv'
df_ctemp = pd.read_csv(city_temperature_csv)
df_ctemp.head()

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df_ctemp.shape

In [None]:
# Distinct values of the Year column, to spot malformed entries.
df_ctemp['Year'].unique()

It looks like there are some rows with invalid values in the Year column. Let's remove them.

In [None]:
# Drop the rows whose Year is one of the malformed values 200 / 201.
# The comparison is done numerically: matching against the strings '200'/'201'
# never matches an integer Year column, which silently leaves the bad rows in.
INVALID_YEARS = [200, 201]
year_numeric = pd.to_numeric(df_ctemp['Year'], errors='coerce')
df_ctemp = df_ctemp[~year_numeric.isin(INVALID_YEARS)]
df_ctemp.shape

In [None]:
# Re-check the distinct Year values after the filter above.
df_ctemp['Year'].unique()

In [None]:
# Distinct values of the Month column.
df_ctemp['Month'].unique()

In [None]:
# Distinct values of the Day column.
df_ctemp['Day'].unique()

In [None]:
# Drop rows whose Day is 0 (not a valid calendar day).
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on df_ctemp, which is ambiguous (SettingWithCopyWarning) on a filtered view.
df_ctemp = df_ctemp[df_ctemp['Day'] != 0].copy()
df_ctemp.shape

Let's create a Date column from the available Month, Day and Year columns, as a Date column comes in handy for analysis.

In [None]:
# Assemble a 'Year-Month-Day' text column from the three integer date parts.
month_text = df_ctemp['Month'].astype(str)
day_text = df_ctemp['Day'].astype(str)
df_ctemp['Date'] = df_ctemp['Year'].astype(str).str.cat([month_text, day_text], sep='-')
df_ctemp.head()

Now lets convert the Date column to the datetime format for it to be actually useful.

In [None]:
# Parse the text dates into datetime values, then show the resulting column dtypes.
parsed_dates = pd.to_datetime(df_ctemp['Date'])
df_ctemp['Date'] = parsed_dates
df_ctemp.info()

Lets check if we have any NAN values.

In [None]:
# Number of missing values in each column.
df_ctemp.isna().sum()

Lets do a deep dive to understand these NAN values in State column.

In [None]:
# Show the rows that do have a State value, restricted to the descriptive columns.
state_view_columns = ['Region', 'Country', 'State', 'City', 'Month', 'Day', 'Year', 'AvgTemperature']
df_ctemp.loc[df_ctemp['State'].notna(), state_view_columns]

In [None]:
# Distinct values of the State column (missing values included).
df_ctemp['State'].unique()

Looks like State data is available only for US, which is fine but lets convert the remaining NANs in this column to NA to make it logical.

In [None]:
# Label missing State values with the literal string 'NA'.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form mutates a temporary object and is not guaranteed to
# update df_ctemp (it is a no-op under pandas copy-on-write).
df_ctemp['State'] = df_ctemp['State'].fillna('NA')

In [None]:
# Re-count missing values per column after filling State.
df_ctemp.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
df_ctemp.describe()

Looking at the temperature data, we know it is in Fahrenheit, so we will convert it to Celsius for better understanding.

But before we do that we need to take care of the invalid data -99 F which is definitely a data issue. Instead of removing it, lets convert to NAN and replace by forward fill.

In [None]:
# Turn the -99 sentinel into NaN so it can be imputed in the next step.
# The replacement is confined to AvgTemperature: a frame-wide replace would also
# rewrite any -99 that happened to appear in an unrelated column.
df_ctemp['AvgTemperature'] = df_ctemp['AvgTemperature'].replace(-99.0, np.nan)
df_ctemp.isna().sum()

In [None]:
# Impute missing temperatures with the previous row's reading.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
# NOTE(review): the fill runs down the whole frame in row order, so a gap at the
# start of one city's rows inherits the last reading of whatever rows precede it;
# consider filling per city if that matters.
df_ctemp['AvgTemperature'] = df_ctemp['AvgTemperature'].ffill()
df_ctemp.isna().sum()

In [None]:
# (rows, columns) after imputation; no rows are dropped by the fill step.
df_ctemp.shape

Now lets see the temperature in Celcius.

In [None]:
# Convert Fahrenheit to Celsius, (F - 32) * 5 / 9, rounded to two decimals.
fahrenheit = df_ctemp['AvgTemperature']
df_ctemp['AvgTemperature'] = fahrenheit.sub(32).mul(5).div(9).round(2)
df_ctemp.head()

Lets check the spread once again to see if we have any other data issues

In [None]:
# Summary statistics again, now that AvgTemperature has been cleaned and converted.
df_ctemp.describe()

Dataset looks good now. Lets visualize Avg. Temperature across all cities and years.

In [None]:
# Distribution of every daily reading in 10-degree bins, with the mean and
# median marked as vertical lines.
temperatures = df_ctemp['AvgTemperature']
bins = list(range(-50, 51, 10))

sns.set_style("whitegrid")
plt.figure(figsize=(15, 6))
sns.distplot(temperatures, bins=bins, color="steelblue")

mean_temp = temperatures.mean()
plt.axvline(mean_temp, label='Mean Avg. Temp.', color='green')

median_temp = np.median(temperatures)
plt.axvline(median_temp, label='Median Avg. Temp.', color='red')

plt.legend()
plt.title('Global Avg. Temp. Distribution')
plt.xlabel('Avg. Temp. (in Celcius)')
# One tick per bin edge, labelled with the edge value.
plt.xticks(bins, [str(edge) for edge in bins])
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Report the shape statistics of the temperature distribution.
temperature_column = df_ctemp['AvgTemperature']
shape_statistics = (
    ('Skewness: ', temperature_column.skew()),
    ('Kurtosis: ', temperature_column.kurtosis()),
)
for label, value in shape_statistics:
    print(label, value)

As we can see from the histogram as well as the skewness value, the distribution is moderately negatively skewed. Also, since the distribution is platykurtic, we don't expect to see a lot of outliers. However, the situation might change if we look at the region or country level.

Most of the temperatures across the world and across the year are concentrated between 20 and 30 C.

Let's now see how the mean average temperature is changing over time.

In [None]:
# Mean temperature across all cities for each calendar date.
# Kept in chronological order (sort_index): sorting by value would scramble the
# time series and the plot would only look right because lineplot re-sorts x.
world_temp = df_ctemp.groupby("Date")["AvgTemperature"].mean().round(2).sort_index()

sns.set_style("whitegrid")
plt.figure(figsize=(18,8))

# The Series index (Date) becomes the x axis, its values the y axis.
sns.lineplot(data = world_temp, color = "green")
plt.xlabel("Time")

plt.ylabel('Temperature (in Celcius)')
plt.title('World Mean Avg. Temperature Over Time')
plt.show()

The mean highest temperature has more or less remained fixed at around 25 C, while the mean lowest temperatures did vary, suggesting that the winters are getting warmer.

Though the above visualization is interesting, we cannot interpret anything meaningful further because the world is geographically diverse and the temperatures will surely vary across geographies and time of the year.

Perhaps, we should zoom in to regions to get a better sense of what is going on.

In [None]:
# Mean temperature per region, sorted ascending so the bars are ordered by value.
region_means = df_ctemp.groupby('Region')['AvgTemperature'].mean()
region_temp = region_means.sort_values().round(2)

# Switch matplotlib to the ggplot look before drawing.
style.use('ggplot')
region_temp.plot.barh(figsize=(10, 5), color='blue', alpha=1)
plt.xlabel('Mean Avg. Temperature')
plt.ylabel('Region')
plt.title('Mean Avg. Temperature By Region')
plt.show()

No surprises here: irrespective of the time of the year, the mean average temperature of Africa is in sharp contrast to that of Europe, whereas Asia is somewhere in between.

Lets visualize other parameters for these regions as well.

In [None]:
# Box plot of the temperature spread within each region.
sns.set_style("darkgrid")
plt.figure(figsize=(20, 8))

sns.boxplot(data=df_ctemp, x='Region', y='AvgTemperature')

plt.title('World Temperature (in Celcius) Spread')
plt.xlabel('Region')
plt.ylabel('Temperature (in Celcius) Spread')
plt.show()

There seem to be a lot of outliers, especially on the minimum temperature side, for all the regions. This could be because of the -99 F values we treated above.

In [None]:
# Keep only the rows recorded for the Asia region.
df_asia = df_ctemp.loc[df_ctemp['Region'].eq('Asia')]
df_asia

In [None]:
# Mean temperature across Asian cities for each calendar date.
# Kept in chronological order (sort_index): sorting by value would scramble the
# time series and the plot would only look right because lineplot re-sorts x.
asia_temp = df_asia.groupby('Date')['AvgTemperature'].mean().round(2).sort_index()

sns.set_style("darkgrid")
plt.figure(figsize=(18, 6))

# The Series index (Date) becomes the x axis, its values the y axis.
sns.lineplot(data= asia_temp, color= 'blue')
plt.xlabel('Time')

plt.ylabel('Temperature (in Celcius)')
plt.title('Asia Mean Avg. Temperature Over Time')
plt.show()

Looks like Asia follows the overall world trend, where the winters are becoming warmer whereas the summer is more or less the same.

Let's deep dive into Bangladesh.

In [None]:
# Box plot of the temperature spread for each Asian country.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 8))

# Both axes must come from df_asia. Taking y from the full df_ctemp paired
# Asia-only countries with a temperature vector covering every region.
sns.boxplot(data=df_asia, x='Country', y='AvgTemperature')

plt.xlabel('Country')
plt.xticks(rotation= 20)
plt.ylabel('Temperature (in Celcius) Spread')

plt.title('Asia Temperature (in Celcius) Spread')
plt.show()

In [None]:
# Keep only the rows recorded for Bangladesh.
df_bd = df_ctemp.loc[df_ctemp['Country'].eq('Bangladesh')]
df_bd

In [None]:
# Cities present in the Bangladesh subset.
df_bd['City'].unique()

In [None]:
# One panel per city: mean temperature for each year as a point plot.
# pointplot is categorical, so the x positions are fixed explicitly with `order`;
# without it each facet derives its own category order and panels can disagree.
year_order = sorted(df_bd['Year'].unique())
g = sns.FacetGrid(df_bd, col= 'City', col_wrap= 2, palette= "Set3", height= 4, aspect= 3, margin_titles=True)
g.map(sns.pointplot, 'Year', 'AvgTemperature', order=year_order)
g.set(yticks= [20, 25, 30, 35])

In [None]:
# One panel per city showing the distribution of its daily temperatures.
g = sns.FacetGrid(data=df_bd, col='City')
g.map(sns.distplot, 'AvgTemperature', rug=False)
g.add_legend()

In [None]:
# Box plot of the temperature spread for each city in the Bangladesh subset.
sns.set_style("darkgrid")
plt.figure(figsize=(12, 6))

# Both axes must come from df_bd. Taking y from the full df_ctemp paired
# Bangladesh-only cities with a temperature vector covering every country.
sns.boxplot(data=df_bd, x='City', y='AvgTemperature')

plt.xlabel('City')
plt.xticks(rotation= 20)
plt.ylabel('Temperature (in Celcius) Spread')
plt.title('bd Cities Temperature (in Celcius) Spread')
plt.show()

In [None]:
# Dhaka rows only, then its January (Month == 1) readings with the date parts
# and the temperature.
df_dhaka = df_bd.loc[df_bd['City'].eq('Dhaka')]
df_dhaka_winter = df_dhaka.loc[
    df_dhaka['Month'].eq(1),
    ['Month', 'Day', 'Year', 'AvgTemperature'],
]
df_dhaka_winter

In [None]:
# Mean January temperature in Dhaka for each year.
sns.set_style("darkgrid")
plt.figure(figsize=(20, 6))

df_dhaka_winter.groupby(['Year','Month'])['AvgTemperature'].mean().plot()
plt.xlabel('Year-Month')

plt.ylabel('Temperature (in Celcius)')
# The data plotted is Dhaka's January subset; the previous title named Delhi.
plt.title('Dhaka January Temperature Over Time')
plt.show()

In [None]:
# Dhaka's May (Month == 5) readings with the date parts and the temperature.
df_dhaka_summer = df_dhaka.loc[
    df_dhaka['Month'].eq(5),
    ['Month', 'Day', 'Year', 'AvgTemperature'],
]
df_dhaka_summer

In [None]:
# Mean May temperature in Dhaka for each year.
sns.set_style("darkgrid")
plt.figure(figsize=(20, 6))

df_dhaka_summer.groupby(['Year','Month'])['AvgTemperature'].mean().plot()

plt.xlabel('Year-Month')
plt.ylabel('Temperature (in Celcius)')
# The data plotted is Dhaka's May subset; the previous title named Delhi.
plt.title('Dhaka May Temperature Over Time')
plt.show()

As we can see, for a sample month of January which is usually the coldest month in Dhaka, the mean temperature has slowly risen over the years. Similarly, the summers have become hotter as well. Sure sign of global warming!

**ANOTHER NOTEBOOK**

In [None]:
# Reload the raw dataset and discard readings equal to the -99 sentinel.
temp = (
    pd.read_csv('/kaggle/input/daily-temperature-of-major-cities/city_temperature.csv')
    .loc[lambda frame: frame['AvgTemperature'].ne(-99)]
)

temp

Let's visualize distribution of Average Temperature and answer some questions

In [None]:
# Histogram with a KDE overlay of all remaining readings in `temp`.
sns.distplot(temp['AvgTemperature'],kde=True, color = "blue")

1. Mean Temperature of Planet Earth

In [None]:
# Mean over every reading in `temp` (values as stored in the CSV; no unit conversion is applied to this frame).
temp['AvgTemperature'].mean()

2. Hottest Place on Earth

In [None]:
# Rows that hold the highest recorded temperature.
# Series.max() is vectorised and skips NaN; the Python builtin max() iterates
# element by element and gives an order-dependent result if NaN is present.
hottest_reading = temp["AvgTemperature"].max()
temp[temp["AvgTemperature"] == hottest_reading]

3. Coldest Place on Earth

In [None]:
# Rows that hold the lowest recorded temperature.
# Series.min() is vectorised and skips NaN; the Python builtin min() iterates
# element by element and gives an order-dependent result if NaN is present.
coldest_reading = temp['AvgTemperature'].min()
temp[temp['AvgTemperature'] == coldest_reading]

4. Rank Continent by Average Temperature

In [None]:
# Per-region summary statistics, ordered from the warmest mean to the coldest.
region_stats = (
    temp
    .groupby('Region')['AvgTemperature']
    .agg(mean_temp='mean', std_temp='std', min_temp='min', max_temp='max', median_temp='median')
    .reset_index()
    .sort_values('mean_temp', ascending=False)
)

plt.figure(figsize=(8, 3))
sns.barplot(data=region_stats, x='mean_temp', y='Region')

plt.xlabel("Mean Temperature", fontsize=12)
plt.ylabel('Continents', fontsize=12)
plt.title("Mean Temperature by Continents", fontsize=16)

5. Which Continent has huge variation in Average Temperature?

In [None]:
# Coefficient of variation (std / mean) per region, largest first.
region_stats = (
    region_stats
    .assign(cv=lambda stats: stats['std_temp'] / stats['mean_temp'])
    .sort_values('cv', ascending=False)
)

plt.figure(figsize=(8, 3))
sns.barplot(data=region_stats, x='cv', y='Region')

plt.xlabel("Coefficent of Variation - Temperature", fontsize=12)
plt.ylabel('Continents', fontsize=12)
plt.title("Coefficent of Variation by Continents", fontsize=16)

8. Top 20 Cities with huge variation in temperature

In [None]:
# Build the per-city statistics here, over ALL cities. Previously this cell read
# `city_stats` before any cell had defined it (NameError on a fresh run), and when
# run out of order it only ranked whichever 20 cities a later cell had kept.
city_stats = (
    temp
    .groupby('City')['AvgTemperature']
    .agg(mean_temp='mean', std_temp='std', min_temp='min', max_temp='max', median_temp='median')
    .reset_index()
)

# Coefficient of variation (std / mean); keep the 20 cities with the largest value.
city_stats['cv'] = city_stats['std_temp'] / city_stats['mean_temp']
city_stats = city_stats.sort_values('cv',ascending=False).head(20)

plt.figure(figsize=(12,6))
sns.barplot(x='cv',y='City',data=city_stats)

plt.xlabel("Coefficient of Variation - Temperature",fontsize=12)
plt.ylabel('City',fontsize=12)
plt.title("Coefficent of Variation by Cities",fontsize=16)

In [None]:
# Look up the statistics row for Fairbanks.
city_stats.loc[city_stats['City'].eq('Fairbanks')]

The city of Fairbanks has a minimum temperature of -50 and a maximum temperature of 79.5. Wow, that is a lot of variation!

6. Top 20 Hottest Cities

In [None]:
# Per-city summary statistics, keeping the 20 cities with the highest mean.
city_stats = (
    temp
    .groupby('City')['AvgTemperature']
    .agg(mean_temp='mean', std_temp='std', min_temp='min', max_temp='max', median_temp='median')
    .reset_index()
    .sort_values('mean_temp', ascending=False)
    .head(20)
)

plt.figure(figsize=(12, 6))
sns.barplot(data=city_stats, x='mean_temp', y='City')

plt.xlabel("Mean Temperature", fontsize=12)
plt.ylabel('City', fontsize=12)
plt.title("Top 20 Hottest Cities", fontsize=16)

7. Top 20 Coldest Cities

In [None]:
# Per-city summary statistics, keeping the 20 cities with the lowest mean.
city_stats = (
    temp
    .groupby('City')['AvgTemperature']
    .agg(mean_temp='mean', std_temp='std', min_temp='min', max_temp='max', median_temp='median')
    .reset_index()
    .sort_values('mean_temp')
    .head(20)
)

plt.figure(figsize=(12, 6))
sns.barplot(data=city_stats, x='mean_temp', y='City')

plt.xlabel("Mean Temperature", fontsize=12)
plt.ylabel('City', fontsize=12)
plt.title("Top 20 Coldest Cities", fontsize=16)

9. Monthwise Mean Temperature in North America

In [None]:
# Mean temperature for each calendar month across the North America rows.
America = temp.loc[temp['Region'].eq('North America')]
america_stats = (
    America
    .groupby('Month')['AvgTemperature']
    .agg(mean_temp='mean')
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=america_stats, x='Month', y='mean_temp')
plt.xlabel("Month", fontsize=12)
plt.ylabel('Temperature', fontsize=12)
plt.title("Mean Temperature by Month", fontsize=16)

10. Monthwise Mean Temperature in Europe

In [None]:
# Mean temperature for each calendar month across the Europe rows.
Europe = temp.loc[temp['Region'].eq('Europe')]
Europe_stats = (
    Europe
    .groupby('Month')['AvgTemperature']
    .agg(mean_temp='mean')
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=Europe_stats, x='Month', y='mean_temp')
plt.xlabel("Month", fontsize=12)
plt.ylabel('Temperature', fontsize=12)
plt.title("Mean Temperature by Month", fontsize=16)

11. Monthwise Mean temperature in Asia

In [None]:
# Mean temperature for each calendar month across the Asia rows.
Asia = temp.loc[temp['Region'].eq('Asia')]
asia_stats = (
    Asia
    .groupby('Month')['AvgTemperature']
    .agg(mean_temp='mean')
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=asia_stats, x='Month', y='mean_temp')

plt.xlabel("Month", fontsize=12)
plt.ylabel('Temperature', fontsize=12)
plt.title("Mean Temperature by Month", fontsize=16)

12. Monthwise Mean Temperature in Australia / South Pacific

In [None]:
# Mean temperature for each calendar month across the Australia/South Pacific rows.
Aus = temp.loc[temp['Region'].eq('Australia/South Pacific')]
aus_stats = (
    Aus
    .groupby('Month')['AvgTemperature']
    .agg(mean_temp='mean')
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=aus_stats, x='Month', y='mean_temp')
plt.xlabel("Month", fontsize=12)
plt.ylabel('Temperature', fontsize=12)
plt.title("Mean Temperature by Month", fontsize=16)

13. Monthwise Mean Temperature in Africa

In [None]:
# Mean temperature for each calendar month across the Africa rows.
Africa = temp.loc[temp['Region'].eq('Africa')]
africa_stats = (
    Africa
    .groupby('Month')['AvgTemperature']
    .agg(mean_temp='mean')
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=africa_stats, x='Month', y='mean_temp')
plt.xlabel("Month", fontsize=12)
plt.ylabel('Temperature', fontsize=12)
plt.title("Mean Temperature by Month", fontsize=16)

14. Monthwise Temperature in Middle East

In [None]:
# Mean temperature for each calendar month across the Middle East rows.
Middle_East = temp.loc[temp['Region'].eq('Middle East')]
middle_east_stats = (
    Middle_East
    .groupby('Month')['AvgTemperature']
    .agg(mean_temp='mean')
    .reset_index()
)

plt.figure(figsize=(10, 6))
sns.barplot(data=middle_east_stats, x='Month', y='mean_temp')
plt.xlabel("Month", fontsize=12)
plt.ylabel('Temperature', fontsize=12)
plt.title("Mean Temperature by Month", fontsize=16)

16. Did Covid-19 have any impact on global warming? Refresh the chart below with the data for 2020 at the end of this year ...

In [None]:
# Mean temperature per (Year, Region), drawn as one line per region.
# The raw file contains malformed Year values (200 and 201); exclude them here so
# they cannot appear as bogus points on the Year axis. This is a no-op if the
# earlier sentinel filter already removed those rows.
valid_years = temp[~temp['Year'].isin([200, 201])]
year_stats = valid_years.groupby(['Year','Region'])['AvgTemperature'].agg(mean_temp='mean').reset_index()

plt.figure(figsize=(12,6))
sns.lineplot(x='Year',y='mean_temp',hue='Region',data=year_stats)

plt.xlabel("Year",fontsize=12)
plt.ylabel('Temperature',fontsize=12)
plt.title("Mean Temperature by Year",fontsize=16)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)