## Data Collection and Wrangling - Simple Example

## 1- Data Collection / Creation

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# Student records, one tuple per student: (name, age, gender, marks).
# Missing marks are stored as the literal string "NaN" (not a real NaN value).
_columns = ("Name", "Age", "Gender", "Marks")
_students = [
    ("Akbar", 13, "M", 90),
    ("Ali", 14, "M", 76),
    ("Elsa", 15, "F", 59),
    ("Jon", 14, "M", "NaN"),
    ("Tom", 14, "M", 89),
    ("Kattie", 18, "F", "NaN"),
    ("Huda", 19, "F", "NaN"),
]
# Pivot the rows into a column-oriented dict of lists.
data = {column: list(values) for column, values in zip(_columns, zip(*_students))}

In [3]:
# Build the DataFrame: each dict key becomes a column, each list its values.
df = pd.DataFrame.from_dict(data)

In [4]:
df

Unnamed: 0,Name,Age,Gender,Marks
0,Akbar,13,M,90.0
1,Ali,14,M,76.0
2,Elsa,15,F,59.0
3,Jon,14,M,
4,Tom,14,M,89.0
5,Kattie,18,F,
6,Huda,19,F,


## 2- Assessment

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    7 non-null      object
 1   Age     7 non-null      int64 
 2   Gender  7 non-null      object
 3   Marks   7 non-null      object
dtypes: int64(1), object(3)
memory usage: 352.0+ bytes


In [6]:
# Marks is reported as object, not a numeric dtype: the missing marks were
# entered as the string "NaN".
df.dtypes

Name      object
Age        int64
Gender    object
Marks     object
dtype: object

In [7]:
df.head()

Unnamed: 0,Name,Age,Gender,Marks
0,Akbar,13,M,90.0
1,Ali,14,M,76.0
2,Elsa,15,F,59.0
3,Jon,14,M,
4,Tom,14,M,89.0


In [8]:
df

Unnamed: 0,Name,Age,Gender,Marks
0,Akbar,13,M,90.0
1,Ali,14,M,76.0
2,Elsa,15,F,59.0
3,Jon,14,M,
4,Tom,14,M,89.0
5,Kattie,18,F,
6,Huda,19,F,


In [9]:
df.tail(3)

Unnamed: 0,Name,Age,Gender,Marks
4,Tom,14,M,89.0
5,Kattie,18,F,
6,Huda,19,F,


In [10]:
# Fixed seed so the same three rows are drawn on every run.
df.sample(3, random_state=42)

Unnamed: 0,Name,Age,Gender,Marks
2,Elsa,15,F,59.0
6,Huda,19,F,
3,Jon,14,M,


## 3- Cleaning

In [11]:
# Turn Marks into a real numeric column: to_numeric(errors="coerce") maps the
# "NaN" placeholder strings (and anything else unparseable) to actual NaN and
# yields float64 explicitly, rather than relying on replace() to infer a new
# dtype. assign() returns a new frame, so df itself is left untouched.
new_df = df.assign(Marks=pd.to_numeric(df["Marks"], errors="coerce"))

In [12]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    7 non-null      object 
 1   Age     7 non-null      int64  
 2   Gender  7 non-null      object 
 3   Marks   4 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 352.0+ bytes


## 4- Dealing with Missing Values

In [13]:
# compute average of the marks that are actually present
# Coerce Marks to numbers first ("NaN" placeholders become real NaN), then let
# mean() skip the missing entries. Unlike str(x).isnumeric(), this also counts
# decimal marks such as 78.5 and does not divide by zero when nothing is numeric.
avg = pd.to_numeric(df["Marks"], errors="coerce").mean()

In [14]:
# replace missing values
# Fill only the Marks column: coerce the "NaN" placeholders to real NaN, then
# fill them with the average. This gives an explicit float64 column and cannot
# touch a matching string in any other column.
df = df.assign(Marks=pd.to_numeric(df["Marks"], errors="coerce").fillna(avg))

In [15]:
# display data
df

Unnamed: 0,Name,Age,Gender,Marks
0,Akbar,13,M,90.0
1,Ali,14,M,76.0
2,Elsa,15,F,59.0
3,Jon,14,M,78.5
4,Tom,14,M,89.0
5,Kattie,18,F,78.5
6,Huda,19,F,78.5


In [16]:
# new_df was built before the fill above, so its Marks column still has the
# missing values.
new_df

Unnamed: 0,Name,Age,Gender,Marks
0,Akbar,13,M,90.0
1,Ali,14,M,76.0
2,Elsa,15,F,59.0
3,Jon,14,M,
4,Tom,14,M,89.0
5,Kattie,18,F,
6,Huda,19,F,


In [17]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    7 non-null      object 
 1   Age     7 non-null      int64  
 2   Gender  7 non-null      object 
 3   Marks   4 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 352.0+ bytes


In [18]:
# Column means over the numeric columns only; Name and Gender hold strings and
# cannot be averaged.
new_df.mean(numeric_only=True)

  new_df.mean()


Age      15.285714
Marks    78.500000
dtype: float64

In [19]:
new_df["Age"].mean()

15.285714285714286

In [20]:
# Mean of the two numeric columns, selected explicitly by name.
new_df[["Age","Marks"]].mean()

Age      15.285714
Marks    78.500000
dtype: float64

In [22]:
# Row-wise mean across the numeric columns (Age, Marks); missing marks are
# skipped, so such rows average over Age alone.
new_df.mean(axis=1, numeric_only=True)

  new_df.mean(axis=1)


0    51.5
1    45.0
2    37.0
3    14.0
4    51.5
5    18.0
6    19.0
dtype: float64

In [23]:
# Row-wise mean over the numeric columns; with skipna=False a row that has a
# missing mark yields NaN instead of averaging the remaining value.
new_df.mean(axis=1, skipna=False, numeric_only=True)

  new_df.mean(axis=1, skipna=False)


0    51.5
1    45.0
2    37.0
3     NaN
4    51.5
5     NaN
6     NaN
dtype: float64

## 5- Reshaping the Data

In [24]:
# Encode Gender numerically (M -> 0, F -> 1) and store it as float64.
# `.get(g, g)` leaves values that are not keys of the mapping untouched, so
# re-running this cell on the already-encoded column is a no-op; a plain
# `.map(dict)` would turn every already-encoded value into NaN. An unexpected
# label now makes astype(float) raise instead of silently becoming NaN.
GENDER_CODES = {"M": 0, "F": 1}
df["Gender"] = df["Gender"].map(lambda g: GENDER_CODES.get(g, g)).astype(float)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    7 non-null      object 
 1   Age     7 non-null      int64  
 2   Gender  7 non-null      float64
 3   Marks   7 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 352.0+ bytes


## 6- Filtering

In [27]:
# Students scoring 80 marks or more.
higher_std = df.loc[df["Marks"].ge(80)]

In [28]:
higher_std

Unnamed: 0,Name,Age,Gender,Marks
0,Akbar,13,0.0,90.0
4,Tom,14,0.0,89.0


## 7- Deleting or Dropping Columns

In [34]:
# Keep only Name and Marks by dropping the other two columns (returns a new frame).
new_drp = df.drop(columns=["Gender", "Age"])

In [35]:
new_drp

Unnamed: 0,Name,Marks
0,Akbar,90.0
1,Ali,76.0
2,Elsa,59.0
3,Jon,78.5
4,Tom,89.0
5,Kattie,78.5
6,Huda,78.5
