In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import requests
import pickle
import plotly.express as px
from sklearn.cluster import KMeans

### Chargement des données initiales

In [2]:
df = pd.read_csv('./city_temperature.csv')
df.describe()

  df = pd.read_csv('./city_temperature.csv')


Unnamed: 0,Month,Day,Year,AvgTemperature
count,2906327.0,2906327.0,2906327.0,2906327.0
mean,6.469163,15.71682,2006.624,56.00492
std,3.456489,8.800534,23.38226,32.12359
min,1.0,0.0,200.0,-99.0
25%,3.0,8.0,2001.0,45.8
50%,6.0,16.0,2007.0,62.5
75%,9.0,23.0,2013.0,75.5
max,12.0,31.0,2020.0,110.0


In [3]:
df.columns

Index(['Region', 'Country', 'State', 'City', 'Month', 'Day', 'Year',
       'AvgTemperature'],
      dtype='object')

### Data cleaning

In [5]:
# Traitement des années
df.drop(df.Year[df.Year < 1995].index, inplace=True)
df.drop(df.Year[df.Year > 2019].index, inplace=True)
df.Year.unique()

array([1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018, 2019], dtype=int64)

In [6]:
# Traitement des jours
df.drop(df.Day[df.Day == 0].index, inplace=True)
df.Day.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [7]:
# Traitement des températures
df.drop(df[df.AvgTemperature == -99.0].index, inplace=True)
df[df.AvgTemperature > 100]

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
50515,Africa,Egypt,,Cairo,5,23,2019,100.2
88434,Africa,Guinea-Bissau,,Bissau,4,16,1998,100.1
166361,Africa,Nigeria,,Niamey,4,18,1998,102.8
166367,Africa,Nigeria,,Niamey,4,24,1998,101.4
167122,Africa,Nigeria,,Niamey,5,18,2000,101.8
...,...,...,...,...,...,...,...,...
2235317,North America,US,Nevada,Las Vegas,7,28,2018,102.7
2235321,North America,US,Nevada,Las Vegas,8,1,2018,100.1
2235668,North America,US,Nevada,Las Vegas,7,14,2019,100.3
2235683,North America,US,Nevada,Las Vegas,7,29,2019,100.6


In [8]:
# Conversion des températures en degrés Celsius
df['AvgTemperature'] = df['AvgTemperature'].map(lambda x: (x - 32) * (5/9))

In [9]:
# Nettoyage des noms des villes
dict_correction_city = {'Montvideo' : 'Montevideo', 
                        'Hartford Springfield' : 'Hartford', 
                        'Tampa St. Petersburg' : 'Tampa', 
                        'Minneapolis St. Paul' : 'Minneapolis', 
                        'Raleigh Durham' : 'Durham', 
                        'Akron Canton' : 'Akron', 
                        'Dallas Ft Worth' : 'Dallas', 
                        'Midland Odessa' : 'Midland', 
                        'San Juan Puerto Rico' : 'San Juan'
}

for key, value in dict_correction_city.items():
    df['City'] = df['City'].map(lambda x: value if x == key else x)

### Ajout de données

#### Ajout des coordonnées GPS de chaque station météo via API

In [10]:
def request_APICITY(city):
    api_url = 'https://api.api-ninjas.com/v1/geocoding?city={}'.format(city)
    response = requests.get(api_url, headers={'X-Api-Key': '8TrYo5jsiYgj3ZsTp2E4zQ==E1uuPAfF2D5wRXXH'})
    if response.status_code == requests.codes.ok:
        res = response.json()
        if len(res) > 0:
            return res[0]
    else:
        print("Error:", response.status_code, response.text)
    return None


In [11]:
# list_city = df['City'].unique()
# dict_city = {}
# list_city_error = []

# for city in list_city:
#     return_api = request_APICITY(city)
#     if return_api is not None:   
#         dict_city[city] = {'lat': return_api['latitude'], 'lon': return_api['longitude']}
#     else:
#         list_city_error.append(city)

# print(dict_city)
# print(list_city_error)

#### Ajout de l'altitude de chaque station météo

In [40]:
def request_APIelevation(lat,lon):
    api_url = 'https://api.open-elevation.com/api/v1/lookup?locations={}'.format(str(lat)+","+str(lon))
    response = requests.get(api_url)
    if response.status_code == requests.codes.ok:
        res = response.json()
        if len(res) > 0:
            return res['results'][0]["elevation"]
    else:
        print("Error:", response.status_code, response.text)
    return None

In [None]:
# for key, value in dict_city.items():
#     dict_city[key]['elevation'] = request_APIelevation(value['lat'], value['lon'])

#### Correction manuelle de quelques données

In [12]:
# dict_city['Syracuse']['lat'] = 43.0481221
# dict_city['Syracuse']['lon'] = -76.1474244
# dict_city['Syracuse']['elevation'] = request_APIelevation(dict_city['Syracuse']['lat'], dict_city['Syracuse']['lon'])

# dict_city['Hamilton']['lat'] = 32.2956076
# dict_city['Hamilton']['lon'] = -64.7827048
# dict_city['Hamilton']['elevation'] = request_APIelevation(dict_city['Hamilton']['lat'], dict_city['Hamilton']['lon'])


# dict_city['Toledo']['lat'] = 41.6529143
# dict_city['Toledo']['lon'] = -83.5378173
# dict_city['Toledo']['elevation'] = request_APIelevation(dict_city['Toledo']['lat'], dict_city['Toledo']['lon'])

# dict_city['Panama City']['lat'] = 8.9714493
# dict_city['Panama City']['lon'] = -79.5341802
# dict_city['Panama City']['elevation'] = request_APIelevation(dict_city['Panama City']['lat'], dict_city['Panama City']['lon'])

# dict_city['Birmingham']['lat'] = 33.5206824
# dict_city['Birmingham']['lon'] = -86.8024326
# dict_city['Birmingham']['elevation'] = request_APIelevation(dict_city['Birmingham']['lat'], dict_city['Birmingham']['lon'])

In [13]:
# Enregistrement du dictionnaire
# with open('dict_city.pickle', 'wb') as handle:
#     pickle.dump(dict_city, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [14]:
## read pickle
# with open('dict_city.pickle', 'rb') as handle:
#     dict_city = pickle.load(handle)

#### Ajout des données dans la dataframe

In [16]:
# # Add lat and lon to df, column city is a key in the dict
# df['lat'] = df['City'].map(lambda x: dict_city[x]['lat'])
# df['lon'] = df['City'].map(lambda x: dict_city[x]['lon'])
# df['alt'] = df['City'].map(lambda x: dict_city[x]['elevation'])

In [15]:
# df['date'] = df.apply(lambda row: datetime(row.Year, row.Month, row.Day), axis=1)

In [17]:
# export to csv
# df.to_csv('cleaned_city_temperature.csv', index=False)

### Chargement des données nettoyées

In [2]:
df = pd.read_csv('./cleaned_city_temperature.csv', parse_dates=['date'])
df['Trimestre'] = 'T' + df['date'].dt.quarter.astype(str)

  df = pd.read_csv('./cleaned_city_temperature.csv', parse_dates=['date'])


### Evolution des températures

#### A l'échelle mondiale

In [3]:
fig = px.scatter(df.groupby('Year').mean().reset_index(), 
                    x='Year', 
                    y='AvgTemperature', 
                    trendline='ols')
fig.update_layout(title='Evolution de la température moyenne annuelle', 
                    xaxis_title='Année', 
                    yaxis_title='Température moyenne (°C)')
fig.show()


In [4]:
fig = px.scatter(df.groupby(['Country', 'Year'])
                    .mean()
                    .groupby('Year')
                    .mean()
                    .reset_index(), 
                x='Year', 
                y='AvgTemperature', 
                trendline='ols',
                title='Evolution de la température moyenne mondiale par année',
                labels={'Year': 'Année', 'AvgTemperature': 'Température moyenne (°C)'})
fig.show()

#### A l'échelle régionale

In [5]:
# plot the data
region_temp = df.query('Year < 2020').groupby(['Region', 'Year'])['AvgTemperature'].mean().reset_index()

fig = px.scatter(region_temp, x='Year', y='AvgTemperature', color='Region', trendline='ols', opacity=1)
fig.update_layout(title='Evolution de la température moyenne par région', 
                    xaxis_title='Année', 
                    yaxis_title='Température moyenne (°C)')
fig.show()


In [6]:
from sklearn.linear_model import LinearRegression

# On ajoute de nouvelles années pour chaque région
region_years = []
for region in region_temp['Region'].unique():
    years = list(range(1995, 2031))
    region_years.append(pd.DataFrame({'Year': years, 'Region': region}))

region_years = pd.concat(region_years)
region_temp_extended = pd.merge(region_years, region_temp, on=['Region', 'Year'], how='outer').fillna(0)

region_pred = {}
# Pour toutes les régions, on prédit les températures pour les années 2020 à 2030
# à l'aide d'une régression linéaire
for region in region_temp_extended['Region'].unique():
    # Fit
    X = region_temp_extended[(region_temp_extended.Year < 2020) & (region_temp_extended.Region == region)][['Year']]
    y = region_temp_extended[(region_temp_extended.Year < 2020) & (region_temp_extended.Region == region)]['AvgTemperature']
    reg = LinearRegression().fit(X, y)

    # Predict
    X_pred = region_temp_extended[(region_temp_extended.Year >= 2020) & (region_temp_extended.Region == region)][['Year']]
    y_pred = reg.predict(X_pred)    
    region_pred[region] = y_pred

# On fait une jointure des predictions avec notre DF
for region in region_pred:
    region_temp_extended.loc[(region_temp_extended.Year >= 2020) & (region_temp_extended.Region == region), 'AvgTemperature'] = region_pred[region]

# On affiche le graphique !
fig = px.scatter(region_temp_extended, x='Year', y='AvgTemperature', color='Region', trendline='ols', opacity=1)
fig.update_layout(title='Evolution de la température moyenne par région', 
                    xaxis_title='Année', 
                    yaxis_title='Température moyenne (°C)')

# Ajouter une ligne de démarcation verticale pour 2020
fig.add_shape(type='line', x0=2020, y0=0, x1=2020, y1=region_temp_extended['AvgTemperature'].max()+1, line=dict(color='red', width=2, dash='dash'))
fig.show()



### Détection de saisonnalité

#### Quelques cas exemples

In [209]:
df_groupPaysDate = df.groupby(['Country','date']).mean()
df_groupPaysDate.reset_index(inplace=True)

In [245]:

def plot_evol_temp_moy_byannee(pays, annee):
    data = df_groupPaysDate[df_groupPaysDate.Country == pays][df_groupPaysDate.Year == annee]
    fig = px.scatter(data, x='date', y='AvgTemperature', color='Country')
    fig.update_layout(title='Température moyenne en {} pour l\'année {}'.format(pays, annee),
                        xaxis_title='Date',
                        yaxis_title='Température moyenne')
    fig.show()

def plot_evol_temp_moy(pays):
    data = df_groupPaysDate[df_groupPaysDate.Country == pays]
    data = data.groupby(['Month','Day'])['AvgTemperature'].mean()

    data = data.reset_index()
    data["date"] = data['Month'].astype(int).astype(str) + '/' + data['Day'].astype(int).astype(str)

    # Use plotly.express
    fig = px.scatter(data, x="date", y="AvgTemperature")
    fig.update_layout(title='Température moyenne en {} par jour'.format(pays),
                        xaxis_title='Date',
                        yaxis_title='Température moyenne')
    fig.show()

In [246]:
plot_evol_temp_moy_byannee('France', 2019)
plot_evol_temp_moy('France')


Boolean Series key will be reindexed to match DataFrame index.



In [247]:
plot_evol_temp_moy_byannee('Argentina', 2019)
plot_evol_temp_moy('Argentina')


Boolean Series key will be reindexed to match DataFrame index.



In [248]:
plot_evol_temp_moy_byannee('Equador', 2019)
plot_evol_temp_moy('Equador')


Boolean Series key will be reindexed to match DataFrame index.



#### Visualisation pour tous les pays

In [250]:
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# get unique countries
countries = df_groupPaysDate.Country.unique()

# create subplots
fig = make_subplots(rows=len(countries)//3 + 1, cols=3, subplot_titles=countries)

# loop through countries and plot scatter plot
for i, country in enumerate(countries):
    data = df_groupPaysDate[df_groupPaysDate.Country == country][df_groupPaysDate.Year == 2019.0]
    fig.add_trace(go.Scatter(x=data['date'], y=data['AvgTemperature'], mode='markers', marker=dict(color='DarkBlue', size=3)), row=i//3+1, col=i%3+1)

# set common x and y labels
fig.update_xaxes(title_text='Date', row=len(countries)//3 + 1, col=2)
fig.update_yaxes(title_text='Average Temperature', row=2, col=1)

# update layout
fig.update_layout(height=5000, width=700, showlegend=False)

# show the plot
fig.show()



Boolean Series key will be reindexed to match DataFrame index.



#### Clusterisation des pays par saisonnalité

In [14]:
def temp_change(x):
    if x > 1:
        return 1
    elif x < -1:
        return -1
    else:
        return 0

df_temp_change = df.groupby(['Country', 'Month'])['AvgTemperature'].mean().reset_index()
df_temp_change['TempChange'] = df_temp_change.groupby('Country')['AvgTemperature'].diff().apply(temp_change).fillna(0)

# On pivote notre df pour avoir les mois en colonnes
df_pivot = df_temp_change.pivot(index='Country', columns='Month', values='TempChange')

# Création de K-means
kmeans = KMeans(n_clusters=4)

# On fit et on ajoute les clusters à notre df_pivot.
kmeans.fit(df_pivot)
df_pivot['cluster'] = kmeans.labels_

# On ajoute les clusters à notre df_temp_change
df_temp_change = pd.merge(df_temp_change, df_pivot['cluster'], on='Country', how='left')

In [15]:
df_grouped = df_temp_change.groupby(['Month','cluster'])['AvgTemperature'].mean().reset_index().pivot(index='Month', columns='cluster', values='AvgTemperature')

# Affichage des clusters
fig = px.line(df_grouped, x=df_grouped.index, y=df_grouped.columns)
fig.update_layout(title='Température moyenne par mois et par cluster', 
                    xaxis_title='Mois', 
                    yaxis_title='Température moyenne (°C)')
fig.show()

In [16]:
countries_by_cluster = df_temp_change.groupby(['Country']).mean().reset_index().groupby('cluster')['Country'].apply(lambda x: ', '.join(x)).reset_index()

for index, row in countries_by_cluster.iterrows():
    print('Cluster {}: {}'.format(row['cluster'], row['Country']))


Cluster 0.0: Albania, Algeria, Austria, Bahrain, Belarus, Belgium, Bermuda, Bulgaria, Canada, China, Croatia, Cyprus, Czech Republic, Denmark, Egypt, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Israel, Italy, Japan, Jordan, Kazakhstan, Kuwait, Kyrgyzstan, Latvia, Lebanon, Macedonia, Mongolia, Morocco, North Korea, Norway, Poland, Portugal, Qatar, Romania, Russia, Saudi Arabia, Serbia-Montenegro, Slovakia, South Korea, Spain, Sweden, Switzerland, Syria, Taiwan, Tajikistan, The Netherlands, Tunisia, Turkey, Turkmenistan, US, Ukraine, United Arab Emirates, United Kingdom, Uzbekistan, Yugoslavia
Cluster 1.0: Argentina, Australia, Bolivia, Brazil, Kenya, Madagascar, Malawi, Mozambique, Namibia, New Zealand, Peru, South Africa, Uruguay, Zambia
Cluster 2.0: Bahamas, Bangladesh, Belize, Burundi, Cuba, Gambia, Honduras, Hong Kong, India, Laos, Mauritania, Mexico, Nepal, Oman, Pakistan, Senegal, Vietnam
Cluster 3.0: Barbados, Benin, Central African Republic, Colombia, C

In [35]:
# create a dataframe with country and cluster information
countries_by_cluster = df_temp_change.groupby(['Country']).mean().reset_index()
countries_by_cluster['cluster'] = countries_by_cluster['cluster'].astype(str)

# create a choropleth map
fig = px.choropleth(countries_by_cluster, 
                    locations='Country',
                    locationmode='country names',
                    color='cluster',
                    hover_name='Country',
                    title='Regroupement de pays par saisonnalité')
fig.update_layout(showlegend=False, title_x=0.5)
fig.show()


### Clustering de données

In [7]:
df_city = df.groupby(['City']).mean().reset_index()
df_city.sample(2)

Unnamed: 0,City,Month,Day,Year,AvgTemperature,lat,lon,alt
81,Columbus,6.521918,15.72149,2007.024335,15.402293,39.96226,-83.000707,241.0
126,Green Bay,6.522685,15.727892,2007.021861,7.764034,44.512638,-88.012579,181.0


In [11]:
kmeans = KMeans(n_clusters=4).fit(df_city['AvgTemperature'].values.reshape(-1, 1))
df_city['clusters'] = kmeans.labels_.astype(str)
fig = px.scatter_geo(df_city, lat='lat', lon='lon', color='clusters', hover_name='City', hover_data=['AvgTemperature'])
fig.show()


In [12]:
from sklearn.preprocessing import StandardScaler
df_city3 = df_city.copy()

scaler = StandardScaler()
df_city3[['AvgTemperature', 'alt']] = scaler.fit_transform(df_city[['AvgTemperature', 'alt']])
kmeans = KMeans(n_clusters=4).fit(df_city3[['AvgTemperature','alt']])
df_city3['clusters'] = kmeans.labels_.astype(str)

fig = px.scatter_geo(df_city3, lat='lat', lon='lon', color='clusters', hover_name='City', hover_data=['AvgTemperature', 'alt'])
fig.show()


In [283]:
# I want a clustering of the cities based on their temperature and their location
df_city2 = df.groupby(['City','lon','lat']).agg(meanT = ('AvgTemperature', 'mean'), maxT = ('AvgTemperature', 'max'), minT = ('AvgTemperature', 'min')).reset_index()
df_city2.head(2)

Unnamed: 0,City,lon,lat,meanT,maxT,minT
0,Abidjan,-4.016107,5.320357,26.775635,31.444444,4.611111
1,Abilene,-99.747591,32.44645,18.245496,34.555556,-10.277778


In [284]:
kmeans = KMeans(n_clusters=4).fit(df_city2[['meanT','maxT','minT']].values)
df_city2['clusters'] = kmeans.labels_.astype(str)
px.scatter(df_city2, x='lon', y='lat', color='clusters', hover_name='City', hover_data=['meanT','maxT','minT'])

### Prédictions de températures

#### Régression linéaire entre latitude/alititude et température

In [8]:
from sklearn.linear_model import LinearRegression

X = df_city[['alt','lat']]
X['lat'] = X['lat'].map(lambda x: abs(x))
y = df_city['AvgTemperature']

reg = LinearRegression().fit(X, y)
reg.score(X, y)

df_city['predT'] = reg.predict(X)

df_city['ecartT'] = df_city['predT'] - df_city['AvgTemperature']
df_city.sample(5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,City,Month,Day,Year,AvgTemperature,lat,lon,alt,predT,ecartT
212,North Platte,6.526911,15.728032,2007.011533,9.522286,41.123887,-100.765423,856.0,10.712814,1.190528
220,Paducah,6.524605,15.716608,2007.013291,14.676461,37.083389,-88.600048,104.0,15.762748,1.086287
204,Nashville,6.521944,15.722076,2007.009985,15.816223,36.162277,-86.774298,130.0,16.07883,0.262607
95,Des Moines,6.522383,15.7246,2007.007242,10.78394,41.591032,-93.604665,254.0,13.043217,2.259277
152,Kampala,6.634641,15.862727,2007.847197,22.535653,0.317714,32.581354,1234.0,27.988885,5.453232


In [10]:
fig = px.scatter_geo(df_city, lat='lat', lon='lon', color='ecartT', hover_name='City', hover_data=['AvgTemperature','predT','alt'], color_continuous_scale='rdbu')
fig.update_layout(title='Ecart entre la température théorique et la température réelle',
                    geo_scope='world')
fig.show()

#### Clustering ecart Temperature

In [13]:
# Clustering ecartT
kmeans = KMeans(n_clusters=4).fit(df_city[['AvgTemperature', 'ecartT']].values)
df_city['clusters'] = kmeans.labels_.astype(str)

fig = px.scatter_geo(df_city, lat='lat', lon='lon', color='clusters', hover_name='City', hover_data=['AvgTemperature','predT','alt','ecartT'])
fig.update_layout(title='Clustering des villes par écart de température', geo_scope='world')
fig.show()
