In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("Resources/GlobalLandTemperaturesByMajorCity.csv")
df.head()


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [3]:
df.shape

(239177, 7)

In [4]:
df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

In [5]:
#List of state names
city_names = df.City.unique()
city_names

array(['Abidjan', 'Addis Abeba', 'Ahmadabad', 'Aleppo', 'Alexandria',
       'Ankara', 'Baghdad', 'Bangalore', 'Bangkok', 'Belo Horizonte',
       'Berlin', 'Bogotá', 'Bombay', 'Brasília', 'Cairo', 'Calcutta',
       'Cali', 'Cape Town', 'Casablanca', 'Changchun', 'Chengdu',
       'Chicago', 'Chongqing', 'Dakar', 'Dalian', 'Dar Es Salaam',
       'Delhi', 'Dhaka', 'Durban', 'Faisalabad', 'Fortaleza', 'Gizeh',
       'Guangzhou', 'Harare', 'Harbin', 'Ho Chi Minh City', 'Hyderabad',
       'Ibadan', 'Istanbul', 'Izmir', 'Jaipur', 'Jakarta', 'Jiddah',
       'Jinan', 'Kabul', 'Kano', 'Kanpur', 'Karachi', 'Kiev', 'Kinshasa',
       'Lagos', 'Lahore', 'Lakhnau', 'Lima', 'London', 'Los Angeles',
       'Luanda', 'Madras', 'Madrid', 'Manila', 'Mashhad', 'Melbourne',
       'Mexico', 'Mogadishu', 'Montreal', 'Moscow', 'Nagoya', 'Nagpur',
       'Nairobi', 'Nanjing', 'New Delhi', 'New York', 'Paris', 'Peking',
       'Pune', 'Rangoon', 'Rio De Janeiro', 'Riyadh', 'Rome', 'São Paulo',
       'S

In [6]:
# list of country names
country_names = df.Country.unique()
country_names

array(["Côte D'Ivoire", 'Ethiopia', 'India', 'Syria', 'Egypt', 'Turkey',
       'Iraq', 'Thailand', 'Brazil', 'Germany', 'Colombia',
       'South Africa', 'Morocco', 'China', 'United States', 'Senegal',
       'Tanzania', 'Bangladesh', 'Pakistan', 'Zimbabwe', 'Vietnam',
       'Nigeria', 'Indonesia', 'Saudi Arabia', 'Afghanistan', 'Ukraine',
       'Congo (Democratic Republic Of The)', 'Peru', 'United Kingdom',
       'Angola', 'Spain', 'Philippines', 'Iran', 'Australia', 'Mexico',
       'Somalia', 'Canada', 'Russia', 'Japan', 'Kenya', 'France', 'Burma',
       'Italy', 'Chile', 'Dominican Republic', 'South Korea', 'Singapore',
       'Taiwan', 'Sudan'], dtype=object)

In [7]:
df.AverageTemperature.isna().value_counts()

False    228175
True      11002
Name: AverageTemperature, dtype: int64

In [8]:
#Percentage of null values
11002/228175

0.048217377013257366

In [9]:
df_clean = df

# Change dt column to a date time object

In [10]:
df_clean.dt = pd.to_datetime(df_clean.dt)


In [11]:
df_clean.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
City                                     object
Country                                  object
Latitude                                 object
Longitude                                object
dtype: object

In [12]:
df2 = df_clean[df_clean["dt"] >= "1921"]

# Removing null cells from Average Temperature

In [13]:
df2.AverageTemperature.isna().value_counts()

False    111207
True         93
Name: AverageTemperature, dtype: int64

In [14]:
93/111207

0.0008362782918341471

In [15]:
df2.AverageTemperature = df2.AverageTemperature.fillna(method='bfill')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [16]:
df2.AverageTemperature.isna().value_counts()

False    111299
True          1
Name: AverageTemperature, dtype: int64

In [17]:
df2.AverageTemperature = df2.AverageTemperature.fillna(method='ffill')

In [18]:
df2.AverageTemperature.isna().value_counts()

False    111300
Name: AverageTemperature, dtype: int64

# Convert Average Temperature celsius to fahrenheit

In [19]:
df2

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
864,1921-01-01,26.686,0.924,Abidjan,Côte D'Ivoire,5.63N,3.23W
865,1921-02-01,28.659,1.065,Abidjan,Côte D'Ivoire,5.63N,3.23W
866,1921-03-01,27.794,0.985,Abidjan,Côte D'Ivoire,5.63N,3.23W
867,1921-04-01,27.579,1.091,Abidjan,Côte D'Ivoire,5.63N,3.23W
868,1921-05-01,26.512,0.455,Abidjan,Côte D'Ivoire,5.63N,3.23W
...,...,...,...,...,...,...,...
239172,2013-05-01,18.979,0.807,Xian,China,34.56N,108.97E
239173,2013-06-01,23.522,0.647,Xian,China,34.56N,108.97E
239174,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E
239175,2013-08-01,24.528,0.840,Xian,China,34.56N,108.97E


In [20]:
df2.AverageTemperature = df2["AverageTemperature"] *(9/5) + 32

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


# Find the top 5 cities with average temperature <= 32

In [21]:
city_names = df2.City.unique()
len(city_names)



100

In [22]:
df3 = df2[df2["AverageTemperature"] <= 32]

In [23]:
df3['City'].value_counts()

Harbin              459
Changchun           454
Moscow              430
Montreal            398
Saint Petersburg    386
Toronto             366
Shenyang            354
Kiev                323
Taiyuan             303
Dalian              264
Tianjin             251
Peking              251
Tangshan            225
Seoul               222
New York            208
Xian                173
Santiago            152
Jinan               135
Chicago             119
Ankara              109
Berlin               96
Mashhad              64
Chengdu              31
Tokyo                18
Paris                17
Kabul                16
London               10
Nanjing               4
Rome                  3
Wuhan                 2
Casablanca            1
Dakar                 1
Taipei                1
Jiddah                1
Shanghai              1
Harare                1
Madrid                1
São Paulo             1
New Delhi             1
Karachi               1
Mogadishu             1
Alexandria      

In [24]:
days_below_freezing = df3['City'].value_counts()

In [25]:
top_5_coldest = list(days_below_freezing[0:5].keys())

In [26]:
top_5_coldest

['Harbin', 'Changchun', 'Moscow', 'Montreal', 'Saint Petersburg']

In [27]:
df4 = df2.loc[df2['City'].isin(top_5_coldest)]

In [28]:
df4

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
48243,1921-01-01,6.9908,1.021,Changchun,China,44.20N,125.22E
48244,1921-02-01,11.9930,0.593,Changchun,China,44.20N,125.22E
48245,1921-03-01,24.1394,0.278,Changchun,China,44.20N,125.22E
48246,1921-04-01,43.6406,0.700,Changchun,China,44.20N,125.22E
48247,1921-05-01,59.3456,0.480,Changchun,China,44.20N,125.22E
...,...,...,...,...,...,...,...
196675,2013-05-01,54.2390,0.195,Saint Petersburg,Russia,60.27N,29.19E
196676,2013-06-01,62.9330,0.413,Saint Petersburg,Russia,60.27N,29.19E
196677,2013-07-01,63.0212,0.273,Saint Petersburg,Russia,60.27N,29.19E
196678,2013-08-01,62.8754,0.279,Saint Petersburg,Russia,60.27N,29.19E


# Export CSV to File

In [29]:
df4.to_csv("CleanedGlobalLandTemperaturesByMajorCity.csv")