In [1]:
import numpy as np
import pandas as pd

## Sorting values

In [2]:
# Toy student table for the sorting examples, written one row per student.
df = pd.DataFrame(
    [
        ("Mahesh", 66, "A", 23),
        ("Suresh", 34, "C", 19),
        ("Naresh", 45, "B", 22),
        ("Prakash", 66, "A", 21),
        ("Arjun", 55, "B", 20),
    ],
    columns=["Names", "Marks", "Grade", "Age"],
)

In [3]:
df

Unnamed: 0,Names,Marks,Grade,Age
0,Mahesh,66,A,23
1,Suresh,34,C,19
2,Naresh,45,B,22
3,Prakash,66,A,21
4,Arjun,55,B,20


In [4]:
# Sort by Marks, highest first, and renumber the index from 0.
# Reassigning the result (instead of inplace=True) keeps the call chainable
# and makes it obvious that `df` is rebound in this cell.
df = df.sort_values(by="Marks", ascending=False, ignore_index=True)

In [5]:
df

Unnamed: 0,Names,Marks,Grade,Age
0,Mahesh,66,A,23
1,Prakash,66,A,21
2,Arjun,55,B,20
3,Naresh,45,B,22
4,Suresh,34,C,19


In [6]:
# Not in-place: the cell displays a sorted copy (ascending by default); `df` itself is unchanged.
df.sort_values(by= 'Age', ignore_index = True)

Unnamed: 0,Names,Marks,Grade,Age
0,Suresh,34,C,19
1,Arjun,55,B,20
2,Prakash,66,A,21
3,Naresh,45,B,22
4,Mahesh,66,A,23


In [7]:
# Sort by Marks first, then break ties on Age; both keys ascending.
df.sort_values(by = ['Marks','Age'],ascending = [True,True], ignore_index = True)

Unnamed: 0,Names,Marks,Grade,Age
0,Suresh,34,C,19
1,Naresh,45,B,22
2,Arjun,55,B,20
3,Prakash,66,A,21
4,Mahesh,66,A,23


## Map, Apply and Between

In [9]:
# Load the loan dataset.
# NOTE(review): absolute, machine-specific Windows path; this cell only runs where that exact file exists.
df = pd.read_csv(r"C:\Users\91771\Desktop\Innomatic\EDA\Pandas\Datasets\loandata.csv")

In [10]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,Male,Yes,0,Graduate,No,150,1800.0,135.0,360.0,1.0,Rural,N
1,LP002502,Female,Yes,2,Not Graduate,,210,2917.0,98.0,360.0,1.0,Semiurban,Y
2,LP002949,Female,No,3+,Graduate,,416,41667.0,350.0,180.0,,Urban,N
3,LP002603,Female,No,0,Graduate,No,645,3683.0,113.0,480.0,1.0,Rural,Y
4,LP001644,,Yes,0,Graduate,Yes,674,5296.0,168.0,360.0,1.0,Rural,Y


### Apply 
- Transforms the values of a column (e.g. categorical <-> numerical).
- Expects a **function**, which is called on each value.

#### Biserial Correlation
- Point-biserial correlation measures the relationship between a binary (two-category) variable and a numerical variable.

In [11]:
pip install scipy

Note: you may need to restart the kernel to use updated packages.


In [12]:
from scipy.stats import pointbiserialr

In [13]:
def gender(x):
    """Encode a gender label as a binary flag.

    Returns 1 when ``x`` equals the string "Male" and 0 for every other
    value. Note that "every other value" includes missing entries (NaN/None)
    and differently-cased spellings, not only "Female".
    """
    return 1 if x == "Male" else 0

In [14]:
# Series.apply returns a new Series (it has no inplace option), so assign the result back to the column.
df['Gender'] = df['Gender'].apply(gender)

In [15]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,1,Yes,0,Graduate,No,150,1800.0,135.0,360.0,1.0,Rural,N
1,LP002502,0,Yes,2,Not Graduate,,210,2917.0,98.0,360.0,1.0,Semiurban,Y
2,LP002949,0,No,3+,Graduate,,416,41667.0,350.0,180.0,,Urban,N
3,LP002603,0,No,0,Graduate,No,645,3683.0,113.0,480.0,1.0,Rural,Y
4,LP001644,0,Yes,0,Graduate,Yes,674,5296.0,168.0,360.0,1.0,Rural,Y


In [16]:
# Point-biserial correlation between the 0/1 Gender column and ApplicantIncome;
# the result unpacks into the correlation coefficient and its p-value.
corr,p_value = pointbiserialr(df['Gender'],df['ApplicantIncome'])

In [17]:
corr

0.013933403563473733

In [18]:
p_value

0.7304181759265278

In [19]:
def Property_Area(x):
    """Encode a property-area label as an integer code.

    "Rural" -> 1, "Semiurban" -> 2, and any other value -> 3. The fallback
    also covers missing or unexpected labels, not only "Urban".
    """
    if x == "Rural":
        return 1
    if x == "Semiurban":
        return 2
    # Everything that is neither "Rural" nor "Semiurban" falls through to 3.
    return 3

In [20]:
# Replace the Property_Area labels with the integer codes returned by Property_Area()
# (the column and the function share the same name).
df['Property_Area'] = df['Property_Area'].apply(Property_Area)

In [21]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,1,Yes,0,Graduate,No,150,1800.0,135.0,360.0,1.0,1,N
1,LP002502,0,Yes,2,Not Graduate,,210,2917.0,98.0,360.0,1.0,2,Y
2,LP002949,0,No,3+,Graduate,,416,41667.0,350.0,180.0,,3,N
3,LP002603,0,No,0,Graduate,No,645,3683.0,113.0,480.0,1.0,1,Y
4,LP001644,0,Yes,0,Graduate,Yes,674,5296.0,168.0,360.0,1.0,1,Y


In [22]:
def status(x):
    """Bucket a numeric income into a label.

    Below 1000 -> "POOR"; from 1000 up to (but not including) 6000 ->
    "MIDDLE CLASS"; anything else -> "RICH". Values for which both
    comparisons are false (e.g. NaN) also land in "RICH".
    """
    if x < 1000:
        return "POOR"
    if x < 6000:
        return "MIDDLE CLASS"
    return "RICH"

In [23]:
# Overwrites the numeric ApplicantIncome column with the labels returned by status().
# This cell is not re-runnable: on the already-labelled column status() would compare
# a str with an int and raise TypeError.
df['ApplicantIncome'] = df['ApplicantIncome'].apply(status)

In [24]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,1,Yes,0,Graduate,No,POOR,1800.0,135.0,360.0,1.0,1,N
1,LP002502,0,Yes,2,Not Graduate,,POOR,2917.0,98.0,360.0,1.0,2,Y
2,LP002949,0,No,3+,Graduate,,POOR,41667.0,350.0,180.0,,3,N
3,LP002603,0,No,0,Graduate,No,POOR,3683.0,113.0,480.0,1.0,1,Y
4,LP001644,0,Yes,0,Graduate,Yes,POOR,5296.0,168.0,360.0,1.0,1,Y


### Map
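- Transforms the values of a column (e.g. categorical <-> numerical).
- Can be written in a single line, with no helper function needed.
- Expects a **dictionary** mapping each old value to its new value.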

In [25]:
# Encode Loan_Status as 0/1. With a dict, Series.map turns any value that is not a key into NaN.
df['Loan_Status'] = df['Loan_Status'].map({"N":0,"Y":1})

In [26]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,1,Yes,0,Graduate,No,POOR,1800.0,135.0,360.0,1.0,1,0
1,LP002502,0,Yes,2,Not Graduate,,POOR,2917.0,98.0,360.0,1.0,2,1
2,LP002949,0,No,3+,Graduate,,POOR,41667.0,350.0,180.0,,3,0
3,LP002603,0,No,0,Graduate,No,POOR,3683.0,113.0,480.0,1.0,1,1
4,LP001644,0,Yes,0,Graduate,Yes,POOR,5296.0,168.0,360.0,1.0,1,1


In [27]:
# Encode Education as 1 (Graduate) / 0 (Not Graduate). Values that are not dict keys become NaN.
df['Education'] = df['Education'].map({"Graduate":1,"Not Graduate":0})

In [28]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,1,Yes,0,1,No,POOR,1800.0,135.0,360.0,1.0,1,0
1,LP002502,0,Yes,2,0,,POOR,2917.0,98.0,360.0,1.0,2,1
2,LP002949,0,No,3+,1,,POOR,41667.0,350.0,180.0,,3,0
3,LP002603,0,No,0,1,No,POOR,3683.0,113.0,480.0,1.0,1,1
4,LP001644,0,Yes,0,1,Yes,POOR,5296.0,168.0,360.0,1.0,1,1



### Between
- For range conditions of the form `lower <= value <= upper`, use `.between(lower, upper)`; both bounds are inclusive by default.

In [29]:
# Reload the raw CSV: the Apply/Map cells above overwrote several columns of `df`.
# NOTE(review): same absolute, machine-specific path as the first load.
df = pd.read_csv(r"C:\Users\91771\Desktop\Innomatic\EDA\Pandas\Datasets\loandata.csv")

In [30]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001722,Male,Yes,0,Graduate,No,150,1800.0,135.0,360.0,1.0,Rural,N
1,LP002502,Female,Yes,2,Not Graduate,,210,2917.0,98.0,360.0,1.0,Semiurban,Y
2,LP002949,Female,No,3+,Graduate,,416,41667.0,350.0,180.0,,Urban,N
3,LP002603,Female,No,0,Graduate,No,645,3683.0,113.0,480.0,1.0,Rural,Y
4,LP001644,,Yes,0,Graduate,Yes,674,5296.0,168.0,360.0,1.0,Rural,Y


In [31]:
# Count rows with 2000 <= CoapplicantIncome <= 4000 using two explicit comparisons.
# Series.sum() counts the True values in a vectorized way instead of iterating
# the boolean Series element by element with the builtin sum().
((df["CoapplicantIncome"] >= 2000) & (df["CoapplicantIncome"] <= 4000)).sum()

144

In [32]:
# Same count with .between(), which is inclusive on both bounds by default.
# Series.sum() counts the True values without a Python-level loop.
df["CoapplicantIncome"].between(2000, 4000).sum()

144

## Time Series (Date/Time)

In [33]:
# Daily range from 8 May 2024 to 15 May 2024 inclusive (8 dates).
# ISO 8601 (YYYY-MM-DD) strings avoid the month/day ambiguity of '5/8/2024'.
date = pd.date_range(start="2024-05-08", end="2024-05-15")

In [34]:
# Seeded generator so the synthetic sales figures are reproducible on Restart & Run All.
rng = np.random.default_rng(42)
# 8 integers in [10, 50): one per day of the 8-day range; the upper bound is exclusive.
sales = rng.integers(10, 50, size=8)

In [35]:
# Two-column frame pairing each date with its sales figure.
df = pd.DataFrame(dict(date=date, sales=sales))

In [65]:
df

Unnamed: 0,date,sales
0,2024-05-08,48
1,2024-05-09,12
2,2024-05-10,25
3,2024-05-11,14
4,2024-05-12,36
5,2024-05-13,29
6,2024-05-14,48
7,2024-05-15,33
