# COMP2200 Data science group project - Group 50

#### Malaria and the Factors Affecting its Spread

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
%matplotlib inline

In [2]:
malaria = pd.read_csv("files/malaria_data")
malaria.head()

Unnamed: 0,Country,Year,Cases,Deaths,CasesMedian,CasesMin,CasesMax,DeathsMedian,DeathsMin,DeathsMax,WHORegion
0,Afghanistan,2017,630308[495000-801000],298[110-510],630308,495000.0,801000.0,298,110.0,510.0,Eastern Mediterranean
1,Algeria,2017,0,0,0,,,0,,,Africa
2,Angola,2017,4615605[3106000-6661000],13316[9970-16600],4615605,3106000.0,6661000.0,13316,9970.0,16600.0,Africa
3,Argentina,2017,0,0,0,,,0,,,Americas
4,Armenia,2017,0,0,0,,,0,,,Europe


In [3]:
avg_temp = pd.read_csv("files/average_yearly_temperature.csv")
avg_temp.head()

Unnamed: 0.1,Unnamed: 0,Country name,Average yearly temperature (° C),Unnamed: 3
0,1,Aruba,25.35,
1,2,Andorra,7.6,
2,3,Afghanistan,12.6,
3,4,Angola,21.55,
4,5,Albania,11.4,


In [4]:
HDI = pd.read_csv("files/HDI.csv")
HDI.head()

Unnamed: 0,HDI Rank,Country,1990,Unnamed: 3,1991,Unnamed: 5,1992,Unnamed: 7,1993,Unnamed: 9,...,Unnamed: 51,2015,Unnamed: 53,2016,Unnamed: 55,2017,Unnamed: 57,2018,Unnamed: 59,2019
0,169,Afghanistan,0.302,,0.307,,0.316,,0.312,,...,,0.5,,0.502,,0.506,,0.509,,0.511
1,69,Albania,0.65,,0.631,,0.615,,0.618,,...,,0.788,,0.788,,0.79,,0.792,,0.795
2,91,Algeria,0.572,,0.576,,0.582,,0.586,,...,,0.74,,0.743,,0.745,,0.746,,0.748
3,36,Andorra,..,,..,,..,,..,,...,,0.862,,0.866,,0.863,,0.867,,0.868
4,148,Angola,..,,..,,..,,..,,...,,0.572,,0.578,,0.582,,0.582,,0.581


In [13]:
incomeDist = pd.read_excel("files/IncomeDistributionsWorldwide.xls")
incomeDist.head()

ValueError: File is not a recognized excel file

### Data cleaning

In [None]:
#Remove any NaN and infinite values from malaria table
malaria.replace([np.inf, -np.inf], np.nan, inplace=True)
malaria.dropna(inplace=True)
malaria.head()

In [None]:
#Remove unnecessary columns from average temperature table
avg_temp = avg_temp.drop(columns=['Unnamed: 0', 'Unnamed: 3'])
avg_temp.head()

In [6]:
#Remove unnecessary columns (including years that are not included in our malaria data)
#from HDI table, as well as any NaN/infinite values
HDI = HDI.drop(columns=['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',  
                        '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', 
                        '2018', '2019'])
HDI.drop(list(HDI.filter(regex = 'Unnamed')), axis = 1, inplace = True)
HDI.replace([np.inf, -np.inf], np.nan, inplace=True)
HDI.dropna(inplace=True)
HDI.head()

Unnamed: 0,HDI Rank,Country,2010,2011,2012,2013,2014,2015,2016,2017
0,169,Afghanistan,0.472,0.477,0.489,0.496,0.5,0.5,0.502,0.506
1,69,Albania,0.745,0.764,0.775,0.782,0.787,0.788,0.788,0.79
2,91,Algeria,0.721,0.728,0.728,0.729,0.736,0.74,0.743,0.745
3,36,Andorra,0.837,0.836,0.858,0.856,0.863,0.862,0.866,0.863
4,148,Angola,0.517,0.533,0.544,0.555,0.565,0.572,0.578,0.582


#### Removing Outliers

#### Checking For outliers

In [None]:
f, ax = plt.subplots(figsize = (15, 10))
sns.boxplot(x = "Year", y = "No. of cases_median", data = malaria).set(title = 'Median Number of Cases BoxPlot - Tens of Millions')

In [None]:
M2014 = malaria[malaria['Year'] == 2014]
columns = ['Country', 'No. of cases_median', 'WHO Region']
print(M2014[columns][(M2014['No. of cases_median'] > 10000000)])

In [None]:
f, ax = plt.subplots(figsize = (15, 10))
sns.boxplot(x = "Year", y = "No. of deaths_median", data = malaria).set(title = 'Median Number of Deaths BoxPlot')

#### Analysis of outliers

The box plots above highlight the spread of the median number of cases and median number of deaths per year in different countries respectively. This is to check for any outliers if there are any. The outliers from the different years are not genuine outliers which need to be removed as those are highlighting the cases within hotspots meaning that, that data is necessary for the analysis. This is further reinforced by the lack of volatility of the outliers from year to year. The same can be said for the box plots for the median number of deaths from year to year. The outliers highlight the deaths in the hotspots and there is no volatility from year to year hence there are no outliers to remove!

# RFE Analysis

Region vs number of infections

In [None]:
from sklearn.feature_selection import RFE

# K Nearest neighbour Analysis

Which variables are the most significant in relation to the spread of malaria?

# Data Visualisation

#### Heatmaps

Display infections in the region

In [None]:
plt.figure(figsize = (10,6))
sns.swarmplot(x = 'Year', y = 'No. of cases_max', data = malaria, hue = 'WHO Region')
plt.legend(bbox_to_anchor= (1, 1), loc = 2)

# Algorithms

Model the effectiveness of preventative measures on the prevalence of the disease