In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Several cells below rely on this to show more than one result per cell.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import faculty.datasets as datasets
from pandas.api.types import CategoricalDtype
import matplotlib.ticker as mtick

from pandas import Series, DataFrame
import os
from functools import reduce

In [2]:
# Load the two inputs: the 2018 football table and the daily temperature
# readings that will be aggregated per country.
all_football_18 = pd.read_csv('/project/all_football_18.csv')
# low_memory=False parses the file in a single pass so that each column gets
# one consistently inferred dtype.
dailytem = pd.read_csv(
    '/project/city_temperature.csv',
    low_memory=False,
)

In [3]:
# Only country-level information is needed, so the finer-grained location
# columns (region, state, city) are removed.
# NOTE: `dailytem` is rebound here; re-running this cell on the already
# trimmed frame raises a KeyError, so re-run the loading cell first.
location_columns = ['Region', 'State', 'City']
dailytem = dailytem.drop(location_columns, axis="columns")
dailytem

Unnamed: 0,Country,Month,Day,Year,AvgTemperature
0,Algeria,1,1,1995,64.2
1,Algeria,1,2,1995,49.4
2,Algeria,1,3,1995,48.8
3,Algeria,1,4,1995,46.4
4,Algeria,1,5,1995,47.9
...,...,...,...,...,...
2906322,US,7,27,2013,82.4
2906323,US,7,28,2013,81.6
2906324,US,7,29,2013,84.2
2906325,US,7,30,2013,83.8


In [4]:
# The 2018 World Cup was held in June and July, so only the June-July 2018
# readings are kept for the comparison.
is_2018 = dailytem["Year"].eq(2018)
dailytem_2018 = dailytem.loc[is_2018]

in_june_or_july = dailytem_2018["Month"].isin([6, 7])
dailytem_2018_6_7 = dailytem_2018.loc[in_june_or_july]
dailytem_2018_6_7

Unnamed: 0,Country,Month,Day,Year,AvgTemperature
8553,Algeria,6,1,2018,67.1
8554,Algeria,6,2,2018,71.4
8555,Algeria,6,3,2018,68.2
8556,Algeria,6,4,2018,68.8
8557,Algeria,6,5,2018,67.2
...,...,...,...,...,...
2898883,US,7,27,2018,59.2
2898884,US,7,28,2018,61.4
2898885,US,7,29,2018,61.9
2898886,US,7,30,2018,60.4


In [5]:
# Mean June-July 2018 temperature per country, indexed by "Country".
# Only AvgTemperature is aggregated; Month, Day and Year carry no meaning as
# averages and are therefore left out of the result.
# NOTE(review): despite the name `annualtem`, this is a two-month mean.
# NOTE(review): some means look low for summer (e.g. France is about 57 F in
# the filtered table further down) - confirm that AvgTemperature contains no
# placeholder values for missing readings before trusting these numbers.
annualtem = dailytem_2018_6_7.groupby(["Country"])[["AvgTemperature"]].mean()
annualtem

Unnamed: 0_level_0,AvgTemperature
Country,Unnamed: 1_level_1
Albania,75.181967
Algeria,75.290164
Argentina,49.514754
Australia,53.295082
Austria,70.360656
...,...
Uruguay,50.483607
Uzbekistan,84.480328
Venezuela,59.895082
Vietnam,85.481967


In [6]:
# Turn the "Country" index produced by the groupby back into an ordinary
# column so it can be filtered and merged on by name.
# NOTE: not idempotent - running this cell twice adds a spare "index" column.
annualtem = annualtem.reset_index(drop=False)
annualtem

Unnamed: 0,Country,AvgTemperature
0,Albania,75.181967
1,Algeria,75.290164
2,Argentina,49.514754
3,Australia,53.295082
4,Austria,70.360656
...,...,...
104,Uruguay,50.483607
105,Uzbekistan,84.480328
106,Venezuela,59.895082
107,Vietnam,85.481967


In [7]:
# Names of the World Cup teams; only these countries are needed for the analysis.
list_of_countries = all_football_18["country_name"].tolist()

In [8]:
# Other, irrelevant countries should be dropped.
# Keep only the rows whose country is in the World Cup list and renumber
# the remaining rows from 0.
is_world_cup_country = annualtem["Country"].isin(list_of_countries)
countries_2018_6_7 = annualtem.loc[is_world_cup_country].reset_index(drop=True)
countries_2018_6_7

Unnamed: 0,Country,AvgTemperature
0,Argentina,49.514754
1,Australia,53.295082
2,Belgium,67.062295
3,Brazil,67.434973
4,Colombia,57.019672
5,Costa Rica,73.106557
6,Croatia,72.696721
7,Denmark,65.755738
8,Egypt,86.15082
9,France,57.033607


In [9]:
# Only 28 World Cup countries were matched above, so 4 are missing.
# Right-join the temperatures onto the football table: every team is kept, and
# teams without a temperature record get NaN in the temperature-side columns.
missing_countries_tem = pd.merge(
    annualtem,
    all_football_18,
    left_on='Country',
    right_on='country_name',
    how="right",
)

# Test the temperature column only. Checking every column for NaN would also
# flag matched teams that merely have a gap in one of the football statistics.
missing_countries_tem = missing_countries_tem[missing_countries_tem["AvgTemperature"].isna()]
missing_countries_tem

Unnamed: 0.1,Country,AvgTemperature,Unnamed: 0,country_name,Final_positional_ranking,goal,shotsPerGame,yellow_card,red_card,possession_percentage,...,right_side,own_third,middle_third,opposition_third,shot_left_side,shot_middle_side,shot_right_side,6_yard_box,18_yard_box,outside_box
9,,,9,England,4,12,13.4,8,0,53.5,...,39%,22%,49%,30%,11%,73%,16%,12%,53%,35%
13,,,13,Iran,19,2,7.7,7,0,33.6,...,32%,34%,47%,20%,17%,75%,8%,4%,52%,43%
19,,,19,Peru,21,2,10.3,5,0,49.9,...,36%,31%,46%,23%,19%,58%,23%,0%,61%,39%
25,,,25,Serbia,18,2,11.0,9,0,44.2,...,34%,25%,46%,30%,12%,73%,15%,6%,58%,36%


In [10]:
# "England", "Iran", "Peru" and "Serbia" have no match in the aggregated
# temperature data. List every country name it contains to look for
# alternative spellings.
missing_name_reference = annualtem["Country"].tolist()
missing_name_reference

# Show the names (split into words) that start with a likely alternative
# prefix for one of the missing countries.
candidate_prefixes = ('United', 'Serbia', 'Persia', 'Pe')
[name.split() for name in missing_name_reference if name.startswith(candidate_prefixes)]

# England turns out to be covered by the "United Kingdom" entry.

['Albania',
 'Algeria',
 'Argentina',
 'Australia',
 'Austria',
 'Bahamas',
 'Bahrain',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bolivia',
 'Brazil',
 'Bulgaria',
 'Canada',
 'Central African Republic',
 'China',
 'Colombia',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Czech Republic',
 'Denmark',
 'Dominican Republic',
 'Egypt',
 'Equador',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Germany',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Ireland',
 'Italy',
 'Ivory Coast',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Macedonia',
 'Madagascar',
 'Malaysia',
 'Mexico',
 'Mongolia',
 'Morocco',
 'Mozambique',
 'Myanmar (Burma)',
 'Namibia',
 'Nepal',
 'New Zealand',
 'Nicaragua',
 'Nigeria',
 'North Korea',
 'Norway',
 'Oman',
 'Pakistan',
 'Panama',
 'Philippines',
 'Poland',
 'Portugal',
 'Qatar',
 'Romania',
 'Russia

[['United', 'Arab', 'Emirates'], ['United', 'Kingdom']]

In [11]:
# Relabel 'United Kingdom' as 'England' so the temperature data lines up with
# the football table.
annualtem['Country'] = annualtem['Country'].replace('United Kingdom', 'England')

# Rebuild the World Cup subset now that the name matches.
is_world_cup_country = annualtem["Country"].isin(list_of_countries)
countries_2018_6_7 = annualtem.loc[is_world_cup_country].reset_index(drop=True)
countries_2018_6_7
# 29 countries are now present; only 'Iran', 'Peru' and 'Serbia' are still missing.
# Temperature data for these three is looked up separately to fill the gap.

Unnamed: 0,Country,AvgTemperature
0,Argentina,49.514754
1,Australia,53.295082
2,Belgium,67.062295
3,Brazil,67.434973
4,Colombia,57.019672
5,Costa Rica,73.106557
6,Croatia,72.696721
7,Denmark,65.755738
8,Egypt,86.15082
9,France,57.033607


In [12]:
# Fahrenheit and Celsius are related by T(F) = 1.8 * t(C) + 32,
# so the Celsius value is t(C) = (T(F) - 32) / 1.8.
# NOTE: the column is overwritten, so running this cell a second time would
# apply the conversion twice.
fahrenheit = countries_2018_6_7["AvgTemperature"]
countries_2018_6_7["AvgTemperature"] = fahrenheit.sub(32).div(1.8)
countries_2018_6_7

Unnamed: 0,Country,AvgTemperature
0,Argentina,9.730419
1,Australia,11.830601
2,Belgium,19.479053
3,Brazil,19.686096
4,Colombia,13.899818
5,Costa Rica,22.836976
6,Croatia,22.60929
7,Denmark,18.753188
8,Egypt,30.083789
9,France,13.907559


In [13]:
# For Peru, Iran and Serbia the most recent temperature data that could be
# found only runs up to 2013.
yearlytem2013 = pd.read_csv('/project/matYearCountry.csv')
yearlytem2013
# Row label 13 is taken to be the 2013 record (row 0 is year 2000 in the
# displayed table) - TODO confirm the rows are consecutive years.
# NOTE(review): these appear to be whole-year means, while the other countries
# use a June-July 2018 mean - confirm that mixing the two is intended.
fallback_countries = ['Peru', 'Iran', 'Serbia']
yearlytem2013 = yearlytem2013.loc[13, fallback_countries]
yearlytem2013

Unnamed: 0,year,Åland,Afghanistan,Africa,Albania,Algeria,American Samoa,Andorra,Angola,Anguilla,...,Uruguay,Uzbekistan,Venezuela,Vietnam,Virgin Islands,Western Sahara,Yemen,Zambia,Zimbabwe,year.1
0,2000,7.283333,15.497833,24.588083,13.746583,24.03225,27.219833,12.311917,22.360667,27.05275,...,17.75675,13.938833,25.3885,23.981917,26.730583,23.020083,27.436167,21.55875,21.149,
1,2001,6.4015,15.778083,24.841167,13.690417,24.724417,27.227583,12.216083,22.502917,27.4085,...,18.57425,13.998,25.771333,24.184917,27.133167,23.704917,27.068917,21.802417,21.693667,
2,2002,6.801083,15.537667,24.961333,13.559917,24.263,27.537917,12.319583,22.9255,27.53825,...,17.993917,13.871917,25.772167,24.39375,27.297833,23.457833,27.125167,22.292417,22.081083,
3,2003,6.323417,14.916,25.025583,13.626583,24.393667,27.264833,12.900667,23.066583,27.677417,...,17.604667,13.157667,26.008333,24.49775,27.376583,23.408917,27.35775,22.15675,21.89125,
4,2004,6.518667,15.770917,24.895917,13.258167,23.916,27.281167,12.043,22.572833,27.334,...,18.060833,14.442,25.817833,24.056917,27.02925,23.420667,27.334083,21.768083,21.330917,
5,2005,6.724083,14.98,25.156167,12.747083,24.222583,27.3735,11.643417,22.966083,27.732333,...,18.194417,14.044,26.019417,24.298,27.424667,23.507667,27.294667,22.8135,22.509917,
6,2006,7.100583,15.595583,24.818917,12.976917,24.200833,27.08125,12.834333,21.947083,27.63625,...,18.240417,13.78625,25.849917,24.523083,27.374833,23.441833,27.175,21.78425,21.61875,
7,2007,7.029,15.106167,24.8515,13.8905,24.065333,27.452417,12.15525,22.427083,27.685083,...,17.516167,13.928333,25.854083,24.302167,27.47,23.145417,27.31075,21.8385,21.60425,
8,2008,7.49225,15.2165,24.754667,13.955167,23.95025,26.995083,11.978583,22.287333,27.140417,...,18.2805,13.309,25.64825,23.784,26.924,23.323083,26.8075,21.535833,21.545583,
9,2009,6.489083,15.25775,25.0265,13.84425,24.154333,27.03425,12.566667,22.3165,27.468583,...,17.871333,13.700333,26.084917,24.465583,27.2385,23.381083,27.342417,21.67025,21.37725,


Peru      19.976250
Iran      20.541000
Serbia    12.843625
Name: 13, dtype: float64

In [29]:
# Fill in the three countries missing from the daily data, using the values
# displayed for `yearlytem2013` (row 13 of matYearCountry.csv).
# NOTE(review): these are added after the Fahrenheit-to-Celsius conversion and
# are not converted, so they are presumably already in Celsius - confirm.
addcountries = DataFrame(
    {
        'Country': ["Peru", 'Iran', 'Serbia'],
        'AvgTemperature': [19.976250, 20.541000, 12.843625],
    }
)
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same result (rows stacked, index renumbered from 0).
countries_2018_6_7 = pd.concat([countries_2018_6_7, addcountries], ignore_index=True)
countries_2018_6_7

Unnamed: 0,Country,AvgTemperature
0,Argentina,9.730419
1,Australia,11.830601
2,Belgium,19.479053
3,Brazil,19.686096
4,Colombia,13.899818
5,Costa Rica,22.836976
6,Croatia,22.60929
7,Denmark,18.753188
8,Egypt,30.083789
9,France,13.907559


In [30]:
# The 2018 World Cup was held in Russia, so each country's temperature is
# compared with Russia's: TemperatureDifference = country - Russia.
# Russia's row is looked up by name rather than by a fixed row number, so the
# baseline stays correct if the set or order of countries changes.
russia_temperature = countries_2018_6_7.loc[
    countries_2018_6_7["Country"] == "Russia", "AvgTemperature"
]
if len(russia_temperature) != 1:
    raise ValueError(
        f"expected exactly one 'Russia' row, found {len(russia_temperature)}"
    )
countries_2018_6_7["TemperatureDifference"] = (
    countries_2018_6_7["AvgTemperature"] - russia_temperature.iloc[0]
)
countries_2018_6_7

Unnamed: 0,Country,AvgTemperature,TemperatureDifference
0,Argentina,9.730419,-8.818761
1,Australia,11.830601,-6.718579
2,Belgium,19.479053,0.929872
3,Brazil,19.686096,1.136916
4,Colombia,13.899818,-4.649362
5,Costa Rica,22.836976,4.287796
6,Croatia,22.60929,4.060109
7,Denmark,18.753188,0.204007
8,Egypt,30.083789,11.534608
9,France,13.907559,-4.641621


In [32]:
# Save the per-country averages and differences to disk.
# With the default index=True the row index is written as an unnamed first column.
countries_2018_6_7.to_csv(path_or_buf="/project/temdifference.csv")