# Cleaning Data

## Imports

https://pandas.pydata.org/docs/getting_started/10min.html

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Importing csv

In [5]:
### Read CSV ###
# df = pd.read_csv(r'folder\folder\data_file.csv', encoding='UTF_16_LE')  # raw string: otherwise '\f' is read as a form feed

### Data size (rows, columns) ###
# print(df.shape)

### Data index and columns ###
# df.index
# df.columns

### Data types of each column ###
# df.dtypes

### Data preview ###
# df.head(5)
# df.tail(3)

## Queries

In [None]:
### Basic stats ###
# df.describe()
# df.column_name.describe()

### Select a column ###
# df['column_name']

### Select the first 10 rows of a column ###
# df['column_name'][:10]

### Select multiple columns ###
# df[['column01', 'column02']]

### Select all rows where a column exceeds a value (e.g. movies over two hours long) ###
# df[df['column_name'] > 120]

## Columns

In [None]:
# Dropping Columns
def drop_multiple_col(col_names_list, df):
    '''
    AIM    -> Remove several columns from a DataFrame in one call

    INPUT  -> col_names_list : column labels to remove
              df             : DataFrame to modify in place

    OUTPUT -> the same df object, without the listed columns
    ------
    '''
    # `columns=` is the keyword spelling of "these labels, along axis 1".
    df.drop(columns=col_names_list, inplace=True)
    return df



## Nulls

In [1]:
def check_missing_data(df):
    '''
    AIM    -> Count the missing values in every column

    INPUT  -> df

    OUTPUT -> Series indexed by column name, holding the number of
              null entries per column, largest count first
    ------
    '''
    null_counts = df.isna().sum()
    return null_counts.sort_values(ascending=False)

def remove_col_white_space(df, col):
    '''
    AIM    -> Strip leading white space from the strings of one column

    INPUT  -> df, name of the column to clean

    OUTPUT -> None (df[col] is overwritten; trailing white space is kept)
    ------
    '''
    left_trimmed = df[col].str.lstrip()
    df[col] = left_trimmed

## Convert Categorical to Numbers

In [None]:
def change_dtypes(col_int, col_float, df):
    '''
    AIM    -> Downcast numeric columns to 32-bit dtypes to save memory

    INPUT  -> col_int   : list of column names to cast to int32
              col_float : list of column names to cast to float32
              df        : DataFrame to modify in place

    OUTPUT -> the same df object, with the listed columns downcast
    ------
    NOTE: casting to int32 fails on columns that contain NaN, and values
    outside the 32-bit range are not preserved -- check the data first.
    '''
    df[col_int] = df[col_int].astype('int32')
    df[col_float] = df[col_float].astype('float32')
    # Return the frame, as the OUTPUT line promises and as drop_multiple_col does.
    return df
    
def convert_cat2num(df, num_encode=None):
    '''
    AIM    -> Convert categorical values to numeric codes

    INPUT  -> df         : DataFrame to modify in place
              num_encode : optional nested dict {column: {category: code}};
                           defaults to the YES/NO encoding for 'col_1' and
                           the WON/LOSE/DRAW encoding for 'col_2'

    OUTPUT -> the same df object, with the mapped values replaced
    ------
    '''
    if num_encode is None:
        # Default template mapping; built per call so it is never shared.
        num_encode = {'col_1': {'YES': 1, 'NO': 0},
                      'col_2': {'WON': 1, 'LOSE': 0, 'DRAW': 0}}
    # A nested dict restricts each replacement to the named column.
    df.replace(num_encode, inplace=True)
    return df
    
def concat_col_str_condition(df):
    '''
    AIM    -> Concatenate 'col_1' and 'col_2' for the rows whose 'col_1'
              string ends with 'pil'

    INPUT  -> df with string columns 'col_1' and 'col_2'

    OUTPUT -> new Series (indexed by the matching rows only) holding
              col_1 + col_2 with every 'pil' replaced by a single space;
              df itself is not modified
    ------
    '''
    # na=False treats missing col_1 values as "no match" instead of NaN.
    mask = df['col_1'].str.endswith('pil', na=False)
    # .loc selects rows and column in one step (no chained indexing).
    col_new = df.loc[mask, 'col_1'] + df.loc[mask, 'col_2']
    # Return the result: previously it was computed and then discarded.
    return col_new.replace('pil', ' ', regex=True)
    
def convert_str_datetime(df):
    '''
    AIM    -> Parse the string column 'transdate' into datetimes

    INPUT  -> df with a 'transdate' column formatted as
              'YYYY-MM-DD HH:MM:SS.ffffff' and at least two columns

    OUTPUT -> the same df object, with a new 'timestamp' column inserted
              at position 2; 'transdate' itself is left unchanged
    ------
    '''
    # Bracket access cannot be shadowed by a DataFrame attribute or method.
    parsed = pd.to_datetime(df['transdate'], format='%Y-%m-%d %H:%M:%S.%f')
    df.insert(loc=2, column='timestamp', value=parsed)
    # Return the frame, as the OUTPUT line promises and as drop_multiple_col does.
    return df