## Data wrangling/cleaning using Python:

- Deleting columns
- Rearranging columns
- Filtering and subsetting

In [None]:
import pandas as pd

In [None]:
# Read the CSV file into a pandas DataFrame.
csv_path = 'merged_clean_ver1.csv'
data = pd.read_csv(csv_path)

# A bare expression on the last line of a cell displays the DataFrame.
data

In [None]:
# list the column labels of the DataFrame
data.columns

### Deleting columns

In [None]:
# Deleting columns -- first attempt, with `axis` left at its default (0).
# With axis=0 pandas looks for 'TCODE' among the row labels, not the columns.
# NOTE(review): with the default index from read_csv this presumably raises
# a KeyError, which is the teaching point -- confirm when running.
# Discussion: explain the `axis` argument (0 = rows/index, 1 = columns).
data = data.drop(labels=['TCODE'])

In [None]:
# Second attempt: now target the columns (axis='columns' is the same as axis=1).
# Hint: is TCODE present in columns?
data = data.drop(['TCODE'], axis='columns')

In [None]:
# Drop the 'tcode' column; `columns=[...]` is equivalent to passing the
# labels together with axis=1.
data = data.drop(columns=['tcode'])

In [None]:
# confirm that 'tcode' is no longer among the columns
data.columns

### Rearranging columns

In [None]:
# Rearranging columns: list the column names in the desired order and index
# the DataFrame with that list. Columns left out of the list are not kept.
column_order = [
    'id',
    'state',
    'gender',
    'median_home_val',
    'median_household_income',
    'ic2',
    'ic3',
    'ic4',
    'ic5',
    'avggift',
    'domain',
    'dob',
    'target_d',
]
data = data[column_order]

In [None]:
# display the DataFrame with its columns in the new order
data

### Filtering and subsetting

In [None]:
# Filtering and subsetting -- using conditions with a DataFrame.
# The comparison yields a boolean Series (one True/False per row); indexing
# the DataFrame with it keeps only the rows marked True.
gender_is_m = data['gender'] == 'M'
data[gender_is_m]

In [None]:
# keep the rows whose 'gender' is any one of the listed values
genders_to_keep = ['M', 'F']
data[data['gender'].isin(genders_to_keep)]

In [None]:
# Two element-wise comparisons combined with | (OR). Each comparison needs
# its own parentheses because | binds more tightly than ==.
gender = data['gender']
data[(gender == 'M') | (gender == 'F')]

In [None]:
# numeric condition: keep the rows where 'target_d' is greater than 100
target_above_100 = data['target_d'] > 100
data[target_above_100]

## Key concepts - 3

- Reset index
- Working with indexes

In [None]:
# display the current DataFrame before filtering it
data

### Filter and reset the index

In [None]:
# Filter and reset the index.
# Teaching note: emphasize again the importance of experimenting with the
# code and checking the output.

# Let's say that we keep working on this filtered subset from here on.
gender_is_m = data['gender'] == 'M'
filtered = data[gender_is_m]

In [None]:
# display the filtered rows; the index still carries the row labels from
# `data`, so it is generally no longer consecutive
filtered

In [None]:
# Renumber the rows of `filtered` starting from 0. drop=True discards the old
# index labels instead of inserting them as a new column.
filtered = filtered.reset_index(drop=True) # what will happen after resetting the index?

In [None]:
# display again: after reset_index(drop=True) the index runs consecutively from 0
filtered

In [None]:
# Work on a copy so that `filtered` itself is left untouched.
temp = filtered.copy(deep=True)
# set_index returns a new DataFrame (shown as the cell output); the result is
# not assigned, so `temp` keeps its original index.
# This is a dummy case: an index should normally be unique and contain no
# nulls; the default index simply auto-increments by 1.
temp.set_index(keys='state')

In [None]:
# Working with indexes
# Plain [start:stop] slicing on a DataFrame selects rows; the stop is
# excluded, so this shows rows 1, 2 and 3.
filtered[1:4]

In [None]:
# pick a subset of columns first, then take the first ten rows of the result
selected_columns = ['gender', 'ic2', 'ic3']
filtered[selected_columns][:10]

In [None]:
# .loc is label-based and includes BOTH ends of the slice: labels 1, 2 and 3
filtered.loc[1:3]

In [None]:
# a single label returns that one row as a Series
# (raises KeyError if label 100 is not present in the index)
filtered.loc[100]

In [None]:
# .iloc is position-based and excludes the end of the slice: positions 1 and 2
filtered.iloc[1:3]

In [None]:
# Now select by integer position on both axes: .iloc[rows, columns].
# Rows at positions 1-9 and the first four columns (stop is excluded).
filtered.iloc[1:10, :4]

In [None]:
# .iloc also accepts explicit lists of row and column positions
row_positions = [1, 2, 3, 4]
column_positions = [0, 2, 4]
filtered.iloc[row_positions, column_positions]

## Key concepts - 4

- Correcting data types
- Removing duplicates

### Data types

In [None]:
# show the data type (dtype) of each column
data.dtypes

In [None]:
# Show only the numeric columns.
# Uses the public select_dtypes API instead of the private
# `_get_numeric_data()` helper, which is not part of the documented pandas
# interface. Note: 'number' does not include boolean columns.
data.select_dtypes(include='number')

In [None]:
# Show only the boolean columns.
# Uses the public select_dtypes API instead of the private
# `_get_bool_data()` helper, which is not part of the documented pandas
# interface.
data.select_dtypes(include='bool')

In [None]:
# show only the columns stored with the generic 'object' dtype
data.select_dtypes(include='object')

### Correcting data types

In [None]:
# will this work? why/why not?
pd.to_numeric(data['median_home_val'])

In [None]:
# Convert 'median_home_val' to a numeric dtype. errors='coerce' replaces any
# value that cannot be parsed with NaN instead of raising.
median_home_val_numeric = pd.to_numeric(data['median_home_val'], errors='coerce')
data['median_home_val'] = median_home_val_numeric

In [None]:
# display the DataFrame; any 'median_home_val' entries that could not be
# converted now appear as NaN
data

In [None]:
# Same conversion for 'ic5': unparseable values become NaN (errors='coerce').
ic5_numeric = pd.to_numeric(data['ic5'], errors='coerce')
data['ic5'] = ic5_numeric

In [None]:
# Check that 'median_home_val' and 'ic5' are now listed as numeric data.
# Uses the public select_dtypes API instead of the private
# `_get_numeric_data()` helper; note that 'number' does not include boolean
# columns.
data.select_dtypes(include='number')

### Removing duplicates

In [None]:
# Removing duplicates
data = data.drop_duplicates()  # play around with the code, show them how to use keep argument

In [None]:
# display the DataFrame after duplicate rows have been dropped
data

In [None]:
# temp = temp.drop_duplicates(subset=['state','gender', 'ic2', 'ic3'])
# Use the `subset` argument (as in the commented-out example above) to remove duplicates based only on specific columns.