Data wrangling is a broad term used, often informally, to describe the process of transforming raw data into a clean and organized format that is ready to use.

The most common data structure used to "wrangle data" is the data frame, which can be both intuitive and incredibly versatile.

Dataframes are tabular: they organize data into rows and columns, much like a spreadsheet.

In [None]:
import pandas as pd

# Location of the Titanic passenger CSV used throughout these recipes.
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(url)

# Preview the first five rows.
df.head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [None]:
# 1. You want to create a new DataFrame

# Start from an empty frame and fill it one column at a time.
dataframe = pd.DataFrame()

# Add columns
dataframe['Name'] = ['Jacky Jackson', 'Steven Stevenson']
dataframe['Age'] = [38, 25]
dataframe['Driver'] = [True, False]

print(dataframe)

# Alternatively, once we have created a DataFrame object, we can add new rows
# to the bottom. DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, so the row is turned into a one-row frame and joined with
# pd.concat instead.

new_person = pd.Series(['Mooney Mooney', 40, True], index=['Name', 'Age', 'Driver'])

# to_frame().T turns the Series into a single row whose columns are the
# Series labels; ignore_index=True renumbers the combined rows 0..n-1.
# The original `dataframe` is left unchanged.
dataframe_with_new_row = pd.concat(
    [dataframe, new_person.to_frame().T],
    ignore_index=True,
)

dataframe_with_new_row

               Name  Age  Driver
0     Jacky Jackson   38    True
1  Steven Stevenson   25   False


Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False
2,Mooney Mooney,40,True


In [None]:
# 2. You want to view some characteristics of a DataFrame


url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

df = pd.read_csv(url)

# NOTE: a notebook only displays a cell's last expression, so this preview
# produces no output here; wrap it in print() to see it.
df.head(2)

# Show the dimensions as (rows, columns)
print(df.shape)

# Additionally, we can get descriptive statistics for any numeric columns using
# describe

print(df.describe())

(1313, 6)
              Age     Survived      SexCode
count  756.000000  1313.000000  1313.000000
mean    30.397989     0.342727     0.351866
std     14.259049     0.474802     0.477734
min      0.170000     0.000000     0.000000
25%     21.000000     0.000000     0.000000
50%     28.000000     0.000000     0.000000
75%     39.000000     1.000000     1.000000
max     71.000000     1.000000     1.000000


In [None]:
# Select the first row by integer position; the result is a Series.
df.iloc[0]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                 29.0
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object

In [None]:
# Select the first four rows (positions 0 through 3).
df.iloc[:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [None]:
# To select individual rows and slices of rows, pandas provides two methods:
# 1. loc is useful when the index of the dataframe is a label (e.g. a string)
# 2. iloc works by looking for the position in the dataframe. For example, iloc[0]
# will return the first row regardless of whether the index is an integer or a label

In [None]:
# 3. You want to select DataFrame rows based on some condition

print(df[df['Sex']=='female'].head(4))

# Multiple conditions are easy as well. For example, here we select all the rows where
# the passenger is a female older than 65 (the comparison is strict, so 65 itself is excluded).

df[(df['Sex'] == 'female') & (df['Age'] > 65)].head(10)

                                            Name PClass   Age     Sex  \
0                   Allen, Miss Elisabeth Walton    1st  29.0  female   
1                    Allison, Miss Helen Loraine    1st   2.0  female   
3  Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st  25.0  female   
6               Andrews, Miss Kornelia Theodosia    1st  63.0  female   

   Survived  SexCode  
0         1        1  
1         0        1  
3         0        1  
6         1        1  


Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


In [None]:
# 4. You need to replace values in the data frame
# Replace every 'female' in the Sex column with 'Women'; this returns a new Series
# and leaves df itself unchanged.
df['Sex'].replace('female','Women').head(5)

0    Women
1    Women
2     male
3    Women
4     male
Name: Sex, dtype: object

In [None]:
# Replace several values at once: the two lists are matched position by position.
df['Sex'].replace(['female','male'],['Women','Man']).head(5)

0    Women
1    Women
2      Man
3    Women
4      Man
Name: Sex, dtype: object

In [None]:
# replace also works across the whole DataFrame; with regex=True the first
# argument is treated as a regular expression.
df.replace(r"1st", "First",regex=True).head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",First,29.0,female,1,1
1,"Allison, Miss Helen Loraine",First,2.0,female,0,1


In [None]:
# 5. You want to rename a column in a pandas DataFrame

# Pass a dict that maps each old column name to its new name.
df.rename(columns={'PClass': 'Passenger Class'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
# Several columns can be renamed in one call by adding more entries to the dict.
df.rename(columns={'PClass': 'Passenger Class','Sex' : 'Gender'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [None]:
import collections

# Build a defaultdict that already holds one empty-string entry per column,
# in column order (handy as a template for a rename mapping).
column_names = collections.defaultdict(str, dict.fromkeys(df.columns, ''))

column_names

defaultdict(str,
            {'Age': '',
             'Name': '',
             'PClass': '',
             'Sex': '',
             'SexCode': '',
             'Survived': ''})

In [None]:
# 6. Finding the maximum, minimum, mean, sum and count of a column
# Calculate statistics
print('Maximum:', df['Age'].max())
print('Minimum:', df['Age'].min())
print('Mean:', df['Age'].mean())
print('Sum:', df['Age'].sum())
print('Count:', df['Age'].count())


# In addition to the statistics used above, pandas offers
# variance (var), standard deviation (std), kurtosis (kurt),
# standard error of the mean (sem), mode, and median

# Calling count on the whole DataFrame gives the number of non-missing values per column
df.count()

Maximum: 71.0
Minimum: 0.17
Mean: 30.397989417989415
Sum: 22980.88
Count: 756


Name        1313
PClass      1313
Age          756
Sex         1313
Survived    1313
SexCode     1313
dtype: int64

In [None]:
# 7. Finding unique values in a column

print(df['Sex'].unique())

# Alternatively, value_counts will display all unique values with the
# number of times each value appears.

print(df['Sex'].value_counts())

print(df['PClass'].value_counts())

# Counting the number of unique classes
# (the total includes the stray '*' entry that value_counts reveals above)

print(df['PClass'].nunique())

['female' 'male']
male      851
female    462
Name: Sex, dtype: int64
3rd    711
1st    322
2nd    279
*        1
Name: PClass, dtype: int64
4


In [None]:
# 8. Handling missing values in a DataFrame
# isnull and notnull return booleans indicating whether a value is missing


# Keep only the rows whose Age is missing and show the first two
df[df['Age'].isnull()].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


In [None]:
import numpy as np
# na_values lists additional values that read_csv should treat as missing while parsing
df = pd.read_csv(url,na_values=[np.nan,None,-999])

# Show every row whose Age is missing
df[df['Age'].isnull()]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0
14,"Baumann, Mr John D",1st,,male,0,0
29,"Borebank, Mr John James",1st,,male,0,0
32,"Bradley, Mr George",1st,,male,1,0
...,...,...,...,...,...,...
1300,"Wiseman, Mr Phillippe",3rd,,male,0,0
1302,"Yalsevac, Mr Ivan",3rd,,male,1,0
1305,"Youssef, Mr Gerios",3rd,,male,0,0
1306,"Zabour, Miss Hileni",3rd,,female,0,1


In [None]:
# 9. Deleting a column
# axis=1 tells drop to remove columns rather than rows; a list drops several at once
df.drop(['Age','Sex'],axis=1).head(2)


Unnamed: 0,Name,PClass,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,1,1
1,"Allison, Miss Helen Loraine",1st,0,1


In [None]:
# If a column does not have a name, you can drop it by
# its column position using dataframe.columns (position 0 is the first column)
df_dropped = df.drop(df.columns[0],axis=1)

In [None]:
#10. Deleting a row
# Use a boolean condition to create a new DataFrame that excludes the rows you want
# to delete, e.g. df[df['Sex'] != 'male'] keeps every row except the male passengers



In [None]:
# 11. Dropping duplicate rows
# NOTE: this is not the cell's last expression, so a notebook will not display it.
df.drop_duplicates().head(2)

# A keen reader will notice that the solution didn't actually drop any rows:

# Show number of rows
print("Number Of Rows In The Original DataFrame:", len(df))
print("Number Of Rows After Deduping:", len(df.drop_duplicates()))

"""
The reason is because drop_duplicates() defaults  to only dropping rows
that match perfectly  across all columns. Under this condition, every row 
in the dataframe , is actually unique. However, often we want to check only 
a subset of columns to check duplicate rows.
"""

# Only compare the Sex column; by default the first row seen for each value is kept
print(df.drop_duplicates(subset=['Sex']))

# NOTE(review): the next three statements repeat the earlier missing-values
# recipe and are unrelated to deduplication; they look like leftovers.
import numpy as np
df = pd.read_csv(url,na_values=[np.nan,None,-999])

df[df['Age'].isnull()].head(2)


# keep='last' retains the last occurrence of each Sex value instead of the first
df.drop_duplicates(subset=['Sex'],keep='last')



Number Of Rows In The Original DataFrame: 1313
Number Of Rows After Deduping: 1313
                                  Name PClass   Age     Sex  Survived  SexCode
0         Allen, Miss Elisabeth Walton    1st  29.0  female         1        1
2  Allison, Mr Hudson Joshua Creighton    1st  30.0    male         0        0


Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1307,"Zabour, Miss Tamini",3rd,,female,0,1
1312,"Zimmerman, Leo",3rd,29.0,male,0,0


In [None]:
# 12. You want to group individual rows according to some shared feature.
# groupby is one of the most powerful features in pandas

df = pd.read_csv(url)

# Average every numeric column within each Sex group. numeric_only=True is
# required on pandas >= 2.0, where the default no longer silently skips text
# columns such as Name and PClass and raises a TypeError instead.
df.groupby('Sex').mean(numeric_only=True)

"""
GroupBy needs to be paired  with some  operation we want to apply  to
each group, such as calculating an aggregate statistics(mean, median,sum)
"""
# Count the Name entries within each Survived group
df.groupby('Survived')['Name'].count()

Survived
0    863
1    450
Name: Name, dtype: int64

In [None]:
# We can also group by a first column and then, within each group, by a second column

# count skips missing values, so these are the passengers with a known Age in each group
df.groupby(['Sex','Survived'])['Age'].count()

Sex     Survived
female  0            71
        1           217
male    0           372
        1            96
Name: Age, dtype: int64

In [None]:
# 13. Iterate over a column
# You want to iterate over every element in a column and apply some action

# Print the first two names in upper case
for passenger_name in df['Name'][0:2]:
    print(passenger_name.upper())

ALLEN, MISS ELISABETH WALTON
ALLISON, MISS HELEN LORAINE


In [None]:
# List comprehension: build the same upper-cased names as a list instead of printing them
[name.upper() for name in df['Name'][0:2]]

['ALLEN, MISS ELISABETH WALTON', 'ALLISON, MISS HELEN LORAINE']

In [None]:
#14. Applying a function Over all elements in a column 

#create a function 
def uppercase(x):
    """Return the upper-cased form of ``x`` by calling its ``upper()`` method."""
    shouted = x.upper()
    return shouted

# Call uppercase on every element of the Name column, then keep the first two results
df["Name"].apply(uppercase)[0:2]

"""
apply is a great way to do data cleaning and wrangling. It is common to write a function
to perform some useful operation (separate first and last names, convert strings
to floats, etc.) and then map that function to every element in a column.
"""

0    ALLEN, MISS ELISABETH WALTON
1     ALLISON, MISS HELEN LORAINE
Name: Name, dtype: object

In [None]:
#15. Applying a function to groups 
# you have grouped rows using groupBy and want to apply a function to each group
"""
By combining groupby and apply we can calculate custom
statistics or apply any function to each group separately.
"""

# x is the sub-DataFrame for one Sex value; count() tallies its non-missing cells per column
df.groupby('Sex').apply(lambda x: x.count())

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,462,462,288,462,462,462
male,851,851,468,851,851,851


In [None]:
# 16. Concatenating DataFrames

# First frame: three people, with the column order taken from the dict's keys.
data_a = {
    'id': ['1', '2', '3'],
    'first': ['Alex', 'Amy', 'Allen'],
    'last': ['Anderson', 'Ackerman', 'Ali'],
}
df_a = pd.DataFrame(data_a, columns=list(data_a))

# Second frame: same columns, different people.
data_b = {
    'id': ['4', '5', '6'],
    'first': ['Billy', 'Brian', 'Bran'],
    'last': ['Bonder', 'Black', 'Balwer'],
}
df_b = pd.DataFrame(data_b, columns=list(data_b))

# Stack the second frame beneath the first (axis=0); each frame keeps its own
# row labels, so the labels 0, 1, 2 appear twice in the result.
frames = [df_a, df_b]
pd.concat(frames, axis=0)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwer


In [None]:
# axis=1 places the two frames side by side, so the result has both sets of columns
pd.concat([df_a,df_b],axis=1)

Unnamed: 0,id,first,last,id.1,first.1,last.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwer


In [None]:
# Alternatively, we can add a single new row to a DataFrame. DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0, so the row is turned
# into a one-row frame and joined with pd.concat instead.

row = pd.Series([10, 'Chris', 'Chilion'], index=['id', 'first', 'last'])

# to_frame().T makes the Series a single row whose columns are its labels;
# ignore_index=True renumbers the combined rows 0..n-1. df_a is not modified.
pd.concat([df_a, row.to_frame().T], ignore_index=True)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,10,Chris,Chilion
