In [1]:
# Perform imports
import pandas as pd
import numpy as np
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
# Enable plotly rendering inside the jupyter notebook
init_notebook_mode(connected=True)

# Load the 'world-temperatures' csv
df = pd.read_csv('Data/world-temperatures.csv')

# Load only the city, country and latitude columns of the coordinates csv
columns = ['city','country','lat']
dflat = pd.read_csv('Data/city-lat-lon.csv', usecols=columns)

# Attach a latitude to every temperature row. The coordinates are first
# reduced to one row per (city, country) so the left join cannot multiply
# temperature rows.
join_keys = ['city', 'country']
unique_coords = dflat.drop_duplicates(subset=join_keys)
merge = pd.merge(df, unique_coords, how='left', on=join_keys)

# Unique identifier column for when two countries share a city name
merge['ID'] = merge['city'] + merge['country']

In [2]:
# List the IDs of every city that came out of the merge without a latitude
missing_lat = merge['lat'].isnull()
print(merge.loc[missing_lat, 'ID'].unique().tolist())

["AbidjanCôte D'Ivoire", 'AhmadabadIndia', 'BenghaziLibya', 'ColomboBrazil', 'CopenhagenDenmark', 'CordobaArgentina', 'Dar Es SalaamTanzania', 'EcatepecMexico', 'Fort WorthUnited States', 'GuarulhosBrazil', 'Guatemala CityGuatemala', 'MeccaSaudi Arabia', 'MontrealCanada', 'Port Au PrinceHaiti', 'Rio De JaneiroBrazil', 'Santa CruzPhilippines', 'SantiagoPhilippines', 'Santo DomingoEcuador', 'ShenyangChina', 'SowetoSouth Africa', 'WashingtonUnited States', "YamoussoukroCôte D'Ivoire", 'ZapopanMexico']


In [3]:
# Fill NaN with appropriate latitude (found from quick Google search).
# The lookup is keyed on the city+country 'ID' rather than the bare city
# name: several of these names (Santiago, Santa Cruz, Cordoba, ...) exist in
# more than one country, and a hand-entered latitude must never be applied
# to a same-named city elsewhere.
values = {"AbidjanCôte D'Ivoire": 5.3,
          'AhmadabadIndia': 23.0,
          'BenghaziLibya': 32.1,
          'ColomboBrazil': -25.3,
          'CopenhagenDenmark': 55.7,
          'CordobaArgentina': -31.4,
          'Dar Es SalaamTanzania': -6.8,
          'EcatepecMexico': 19.6,
          'Fort WorthUnited States': 32.8,
          'GuarulhosBrazil': -23.5,
          'Guatemala CityGuatemala': 14.6,
          'MeccaSaudi Arabia': 21.4,
          'MontrealCanada': 45.5,
          'Port Au PrinceHaiti': 18.6,
          'Rio De JaneiroBrazil': -22.9,
          'Santa CruzPhilippines': 14.3,
          'SantiagoPhilippines': 16.7,
          'Santo DomingoEcuador': -0.2,
          'ShenyangChina': 41.8,
          'SowetoSouth Africa': -26.2,
          'WashingtonUnited States': 38.9,
          "YamoussoukroCôte D'Ivoire": 6.8,
          'ZapopanMexico': 20.7}

# Only rows whose latitude is still NaN take a value from the lookup; the
# final round(1) applies to every latitude, not just the filled ones.
merge['lat'] = merge['lat'].fillna(merge['ID'].map(values)).round(1)

In [4]:
# Summarise the merged frame: per-column non-null counts and dtypes
merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70758 entries, 0 to 70757
Data columns (total 7 columns):
year            70758 non-null int64
city            70758 non-null object
country         70758 non-null object
avg_temp        68527 non-null float64
gbl_avg_temp    70758 non-null float64
lat             70758 non-null float64
ID              70758 non-null object
dtypes: float64(3), int64(1), object(3)
memory usage: 4.3+ MB


In [5]:
# Looks like there are some empty values as well as outliers that don't make
# sense. Each city is cleaned independently:
#   1. a value more than three standard deviations from the city's mean is
#      an outlier and takes the previous row's value (or the city mean when
#      the very first row is the outlier);
#   2. remaining nulls are forward-filled, then leading nulls back-filled,
#      always within the city so no value leaks in from a neighbouring city.

OUTLIER_STDS = 3  # half-width of the accepted band, in standard deviations

# Cleaned per-city frames are collected and concatenated once at the end:
# growing a DataFrame with .append inside a loop is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.
city_frames = []

for ID in merge['ID'].unique():

    # City specific frame. Must be an explicit copy of the original so the
    # edits below never touch `merge`.
    city_df = merge[merge['ID'] == ID].copy()
    temps = city_df['avg_temp'].copy()

    # Mean and stddev of the city define the accepted band
    mean = temps.mean()
    std = temps.std()
    maxo = mean + OUTLIER_STDS * std
    mino = mean - OUTLIER_STDS * std

    # NaN compares False on both sides, so nulls are never flagged here
    is_outlier = (temps > maxo) | (temps < mino)

    # Edge case: the first row has no predecessor, so it falls back to the mean
    if is_outlier.iloc[0]:
        temps.iloc[0] = mean
        is_outlier.iloc[0] = False

    # Blanking the outliers and forward-filling gives every outlier the last
    # accepted value before it, the same result as copying the previous row
    # one position at a time. The bfill then covers nulls at the start of
    # the city's series.
    city_df['avg_temp'] = temps.mask(is_outlier).ffill().bfill()
    city_frames.append(city_df)

IDdf = pd.concat(city_frames)

IDdf.head()

Unnamed: 0,year,city,country,avg_temp,gbl_avg_temp,lat,ID
0,1849,Abidjan,Côte D'Ivoire,25.58,7.98,5.3,AbidjanCôte D'Ivoire
1,1850,Abidjan,Côte D'Ivoire,25.52,7.9,5.3,AbidjanCôte D'Ivoire
2,1851,Abidjan,Côte D'Ivoire,25.67,8.18,5.3,AbidjanCôte D'Ivoire
3,1852,Abidjan,Côte D'Ivoire,25.67,8.1,5.3,AbidjanCôte D'Ivoire
4,1853,Abidjan,Côte D'Ivoire,25.67,8.04,5.3,AbidjanCôte D'Ivoire


In [6]:
# Add a Fahrenheit twin ('<name>_f') for each Celsius temperature column
for celsius_col in ('avg_temp', 'gbl_avg_temp'):
    IDdf[celsius_col + '_f'] = IDdf[celsius_col] * 1.8 + 32

In [7]:
# 10-row rolling mean of the Fahrenheit temperature, computed per city.
# The grouped result is indexed by (ID, original row label); dropping the ID
# level lets the assignment align back onto IDdf's own index.
temps_by_city = IDdf.groupby('ID')['avg_temp_f']
rolling_by_city = temps_by_city.rolling(10).mean()
IDdf['r_mean'] = rolling_by_city.reset_index(level=0, drop=True)

In [8]:
# Global series: one row per distinct (year, global temperature) pair,
# ordered by year and re-indexed from zero
year_and_global = IDdf[['year','gbl_avg_temp_f']]
gbldf = (year_and_global
         .sort_values('year')
         .drop_duplicates()
         .reset_index(level=0, drop=True))

# 10-row rolling mean of the global Fahrenheit temperature
gbldf['gbl_r_mean'] = gbldf['gbl_avg_temp_f'].rolling(window=10).mean()

In [9]:
# Bring the global rolling mean back onto the per-city rows, matching on
# year and global temperature
gbl_keys = ['year', 'gbl_avg_temp_f']
gbl_unique = gbldf.drop_duplicates(subset=gbl_keys)
totdf = pd.merge(IDdf, gbl_unique, how='left', on=gbl_keys)
totdf.head()

Unnamed: 0,year,city,country,avg_temp,gbl_avg_temp,lat,ID,avg_temp_f,gbl_avg_temp_f,r_mean,gbl_r_mean
0,1849,Abidjan,Côte D'Ivoire,25.58,7.98,5.3,AbidjanCôte D'Ivoire,78.044,46.364,,46.3604
1,1850,Abidjan,Côte D'Ivoire,25.52,7.9,5.3,AbidjanCôte D'Ivoire,77.936,46.22,,46.3784
2,1851,Abidjan,Côte D'Ivoire,25.67,8.18,5.3,AbidjanCôte D'Ivoire,78.206,46.724,,46.4666
3,1852,Abidjan,Côte D'Ivoire,25.67,8.1,5.3,AbidjanCôte D'Ivoire,78.206,46.58,,46.481
4,1853,Abidjan,Côte D'Ivoire,25.67,8.04,5.3,AbidjanCôte D'Ivoire,78.206,46.472,,46.4576


In [10]:
# Which city most closely correlates to global temperature change?
# For every city, correlate the last 15 rows of its rolling mean with the
# global rolling mean on those same rows.

ids = totdf['ID'].unique()
cor = []
for ID in ids:
    # Filter once per city and reuse the slice for both series
    recent = totdf[totdf['ID'] == ID].tail(15)
    cor.append(np.corrcoef(recent['r_mean'], recent['gbl_r_mean'])[1,0])

# np.corrcoef returns NaN when a window is constant or contains NaN. The
# built-in max() and np.argmax() treat NaN differently, so mixing them can
# print a coefficient and a city name that belong to different cities. The
# winner is therefore picked once, with NaNs ignored, and both lines are
# printed from that same position.
cor_values = np.asarray(cor, dtype=float)
if np.isnan(cor_values).all():
    # Also covers an empty frame: there is nothing to rank
    print('No city has a defined correlation')
else:
    best = np.nanargmax(cor_values)
    print(cor_values[best])
    print(ids[best])

0.9954157335541769
YamoussoukroCôte D'Ivoire


In [11]:
# Row counts split by hemisphere: far more rows sit north of the equator.
# A latitude of exactly 0 falls in the False group.
north_of_equator = totdf['lat'] > 0
totdf.groupby(north_of_equator).size().reset_index()

Unnamed: 0,lat,0
0,False,9313
1,True,61445


In [12]:
# Absolute gap between each city's rolling mean and the global rolling mean
totdf['city_gbl_diff'] = (totdf['r_mean'] - totdf['gbl_r_mean']).abs()

In [13]:
# Keep the rows where a city's rolling mean was within 0.1 degrees of the
# global rolling mean. Both year bounds are exclusive, so the rows kept
# cover 1964 through 2012.
after_start = totdf['year'] > 1963
before_end = totdf['year'] < 2013
near_global = totdf['city_gbl_diff'] < 0.1
last_50_years = totdf[after_start & before_end & near_global]

# Number of qualifying years per city name
last_50_years['city'].value_counts()

Tbilisi             14
Yerevan             14
Sofia               13
Hamburg             12
Belfast             11
Prague              10
Detroit             10
Dublin              10
Chisinau             3
La Paz               1
Colorado Springs     1
Denver               1
Name: city, dtype: int64

In [14]:
# Mean latitude of the qualifying rows that lie north of the equator
last_50_years.loc[last_50_years['lat'] > 0, 'lat'].mean()

46.68787878787869

In [15]:
# Cities of the qualifying rows that lie south of the equator
last_50_years.loc[last_50_years['lat'] < 0, 'city']

33769    La Paz
Name: city, dtype: object

In [16]:
# Distinct city names among the qualifying rows, in order of first appearance
last_50_years['city'].drop_duplicates().values

array(['Belfast', 'Chisinau', 'Colorado Springs', 'Denver', 'Detroit',
       'Dublin', 'Hamburg', 'La Paz', 'Prague', 'Sofia', 'Tbilisi',
       'Yerevan'], dtype=object)

In [17]:
# Plot the rolling mean of every city found in last_50_years against the
# global rolling mean.

# Global trace: one point per distinct (year, global rolling mean) pair
gbl_points = totdf.groupby(['year','gbl_r_mean']).size().reset_index(name='count')
trace_gbl = [go.Scatter(
    x = gbl_points['year'],
    y = gbl_points['gbl_r_mean'],
    name = 'Global',
    line = dict(width = 5, color = 'black'))]

# One trace per city; each city is filtered and sorted by year once
trace_city = []
for ID in last_50_years['ID'].unique():
    city_rows = totdf[totdf['ID'] == ID]
    city_by_year = city_rows.sort_values(['year'])
    trace_city.append(go.Scatter(
        x = city_by_year['year'],
        y = city_by_year['r_mean'],
        name = city_rows['city'].unique().tolist()[0],
        opacity = 0.6))

# City traces come first in the list, the global trace last
data = trace_city + trace_gbl

year_axis = dict(
    title = 'Year',
    dtick = 5,
    range = [1963, 2013])
temp_axis = dict(
    title = 'Temperature (°F)',
    dtick = 0.5,
    range = [46, 50])
layout = go.Layout(
    title = 'Global Temperature',
    xaxis = year_axis,
    yaxis = temp_axis)

fig = {'data':data, 'layout':layout}
iplot(fig)

In [18]:
# Create plot of average temp in Portland (city I live in) to global temp changes:

# One name drives both the row filter and the legend label. Previously the
# rows were filtered on 'Sofia' while the trace was labelled 'Portland', so
# the chart showed another city's data under Portland's name.
city_name = 'Portland'

# Global trace: one point per distinct (year, global rolling mean) pair
gbl_points = totdf.groupby(['year','gbl_r_mean']).size().reset_index(name='count')
trace_gbl = go.Scatter(
    x = gbl_points['year'],
    y = gbl_points['gbl_r_mean'],
    name = 'Global')

# Rows of the chosen city, in year order
city_rows = totdf[totdf['city'] == city_name].sort_values(['year'])
if city_rows.empty:
    # Fail loudly instead of drawing an empty (or mislabelled) trace
    raise ValueError('No temperature rows found for city: ' + city_name)

trace_city = go.Scatter(
    x = city_rows['year'],
    y = city_rows['r_mean'],
    name = city_name)

data = [trace_city,trace_gbl]

# NOTE(review): the y-axis range was chosen while the Sofia rows were being
# plotted; confirm it still frames the Portland series.
layout = go.Layout(
    title = 'Global Temperature',
    xaxis = dict(
        title = 'Year',
        dtick = 5,
        range = [1963, 2013]),
    yaxis = dict(
        title = 'Temperature (°F)',
        dtick = 0.25,
        range = [47, 50]))

fig = {'data':data, 'layout':layout}
iplot(fig)