# Projet de Fouille de données

## Étape II : Nettoyage des données

### Import des bibliothèques

On commence par importer les bibliothèques nécessaires (pandas et numpy).

In [1]:
import pandas
import numpy

### Import des données

Puis, on charge les données de notre jeu de données depuis le fichier CSV, et on les place dans un DataFrame pandas.

In [2]:
# Load the raw (corrupted) dataset from CSV.
# pandas.read_csv already returns a DataFrame, so no extra
# pandas.DataFrame(...) wrapping is needed; both names are kept for later cells.
data = pandas.read_csv("../data/corrupted_data.csv")
dataFrame = data

### Nettoyage des données

#### Renommage de la première colonne en "id"

On renomme la première colonne, nommée par défaut « Unnamed: 0 », en « id ».

In [3]:
# Give the 'Unnamed: 0' column the name 'id'; the result is a new frame.
cleanedDataFrame = dataFrame.rename({'Unnamed: 0': 'id'}, axis="columns")

In [4]:
# Show the column labels to check the rename.
cleanedDataFrame.columns.tolist()

['id', 'date', 'county', 'state', 'fips', 'cases', 'deaths']

#### Suppression des lignes avec des valeurs manquantes (cellules vides, et "Unknown")

In [5]:
# Drop every row that contains at least one missing value.
cleanedDataFrame = cleanedDataFrame.dropna(axis="index", how="any")

In [6]:
# Number of rows currently in the frame.
cleanedDataFrame.shape[0]

775346

In [7]:
# Number of non-null cells in each column.
cleanedDataFrame.notna().sum()

id        775346
date      775346
county    775346
state     775346
fips      775346
cases     775346
deaths    775346
dtype: int64

In [8]:
# Per-column non-null count minus the row count:
# all zeros means no missing cell is left.
cleanedDataFrame.count().sub(len(cleanedDataFrame))

id        0
date      0
county    0
state     0
fips      0
cases     0
deaths    0
dtype: int64

In [9]:
# Remove the rows whose county is the placeholder "Unknown",
# by dropping the index labels of the matching rows.
cleanedDataFrame = cleanedDataFrame.drop(
    index=cleanedDataFrame.index[cleanedDataFrame["county"].eq("Unknown")]
)

In [10]:
# Number of rows currently in the frame.
cleanedDataFrame.shape[0]

775346

In [11]:
# Display any row whose county is "Unknown".
cleanedDataFrame.loc[cleanedDataFrame["county"].eq("Unknown")]

Unnamed: 0,id,date,county,state,fips,cases,deaths


In [12]:
# Count the rows whose county is "Unknown".
int(cleanedDataFrame["county"].eq("Unknown").sum())

0

#### Conversion au même format pour les dates

In [13]:
 # Rewrite dates written as YYYY.MM.DD into YYYY-MM-DD; values that do not
 # match the pattern are left as they are.
 # Raw strings keep the regex escape (\.) and the back-references (\1, \2, \3)
 # literal without relying on Python's deprecated invalid-escape handling.
 cleanedDataFrame['date'] = cleanedDataFrame['date'].replace(
     to_replace=r"^([0-9]{4})\.([0-9]{2})\.([0-9]{2})$",
     value=r"\1-\2-\3",
     regex=True,
 )

In [14]:
# Display any row whose date is still in the dotted YYYY.MM.DD form.
# The pattern is a raw string so that "\." is a valid regex escape
# rather than an invalid Python string escape.
cleanedDataFrame[
    cleanedDataFrame['date'].str.contains(
        r'^[0-9]{4}\.[0-9]{2}\.[0-9]{2}$', regex=True, na=False
    )
]

Unnamed: 0,id,date,county,state,fips,cases,deaths


In [15]:
# Count the rows whose date is still in the dotted YYYY.MM.DD form.
# The pattern is a raw string so that "\." is a valid regex escape
# rather than an invalid Python string escape.
len(
    cleanedDataFrame[
        cleanedDataFrame['date'].str.contains(
            r'^[0-9]{4}\.[0-9]{2}\.[0-9]{2}$', regex=True, na=False
        )
    ]['date']
)

0

#### Suppression des "-" dans les colonnes quantitatives (fips, cases, deaths)

In [16]:
# Convert each fips value to text, then remove a single leading "-"
# when it is followed by at least one more character.
cleanedDataFrame['fips'] = (
    cleanedDataFrame['fips']
    .map(str)
    .str.replace("^(-)(.+)$", "\\2", regex=True)
)

In [17]:
# Show the fips values that still start with "-".
cleanedDataFrame.loc[
    cleanedDataFrame['fips'].map(str).str.contains('^-.+$', regex=True, na=False),
    'fips',
]

Series([], Name: fips, dtype: object)

In [18]:
# Convert each cases value to text, then remove a single leading "-"
# when it is followed by at least one more character.
cleanedDataFrame['cases'] = (
    cleanedDataFrame['cases']
    .map(str)
    .str.replace("^(-)(.+)$", "\\2", regex=True)
)

In [19]:
# Show the cases values that still start with "-".
cleanedDataFrame.loc[
    cleanedDataFrame['cases'].map(str).str.contains('^-.+$', regex=True, na=False),
    'cases',
]

Series([], Name: cases, dtype: object)

In [20]:
# Convert each deaths value to text, then remove a single leading "-"
# when it is followed by at least one more character.
cleanedDataFrame['deaths'] = (
    cleanedDataFrame['deaths']
    .map(str)
    .str.replace("^(-)(.+)$", "\\2", regex=True)
)

In [21]:
# Show the deaths values that still start with "-".
cleanedDataFrame.loc[
    cleanedDataFrame['deaths'].map(str).str.contains('^-.+$', regex=True, na=False),
    'deaths',
]

Series([], Name: deaths, dtype: object)

#### Transformation des colonnes quantitatives en entiers (fips, cases, deaths)

In [22]:
# Parse the fips text back into numbers, downcasting to the smallest
# integer dtype that can hold the values.
cleanedDataFrame["fips"] = pandas.to_numeric(
    cleanedDataFrame["fips"],
    errors="raise",
    downcast="integer",
)

In [23]:
# Display the converted fips column and its dtype.
cleanedDataFrame.loc[:, "fips"]

0         53061
1         53061
2         53061
3         17031
4         53061
          ...  
800432    56037
800433    56039
800434    56041
800435    56043
800436    56045
Name: fips, Length: 775346, dtype: int32

In [24]:
# Parse the cases text back into numbers, downcasting to the smallest
# integer dtype that can hold the values.
cleanedDataFrame["cases"] = pandas.to_numeric(
    cleanedDataFrame["cases"],
    errors="raise",
    downcast="integer",
)

In [25]:
# Display the converted cases column and its dtype.
cleanedDataFrame.loc[:, "cases"]

0            1
1            1
2            1
3            1
4            1
          ... 
800432    2098
800433    1739
800434    1187
800435     519
800436     419
Name: cases, Length: 775346, dtype: int32

In [26]:
# Parse the deaths text back into numbers, downcasting to the smallest
# integer dtype that can hold the values.
cleanedDataFrame["deaths"] = pandas.to_numeric(
    cleanedDataFrame["deaths"],
    errors="raise",
    downcast="integer",
)

In [27]:
# Display the converted deaths column and its dtype.
cleanedDataFrame.loc[:, "deaths"]

0          0
1          0
2          0
3          0
4          0
          ..
800432    10
800433     2
800434     5
800435     8
800436     2
Name: deaths, Length: 775346, dtype: int16

### Export des données nettoyées

In [28]:
# Write the cleaned data next to the input file, without the index column.
# Forward slashes are used (as for the input path) so the path works on every
# platform and contains no invalid "\d" / "\c" string escapes.
cleanedDataFrame.to_csv("../data/cleaned_data.csv", index=False)