## Introduction to Pandas

# What is pandas:
$Pandas$ is a powerful and popular Python library designed for $data$ $manipulation$ (cleaning, transforming, and structuring data) and data analysis (finding patterns, trends, and insights).

In [26]:
import pandas as pd


# Key Pandas Concepts
# Series

- A $Series$ is a one-dimensional labeled array that can hold any data type: integers, floats, strings, or even Python objects. Each element in the $Series$ has a label, called its index.
- It is often used to track changes or patterns over time, such as daily temperatures, stock prices or sales revenue.

# Data Frame
- A $DataFrame$ is a two-dimensional labeled data structure in Pandas, similar to a table in a database, an Excel spreadsheet, or a SQL table.
- It consists of rows and columns, where:

   a. Rows have indices (labels).
   
   b. Columns have names (labels).

In [27]:
# A Series built from a plain list gets the default integer index 0..n-1.
values = [1, 2, 3, 4, 5]
s = pd.Series(values)
print(s)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [28]:
# Passing index= replaces the default 0..n-1 labels with custom ones.
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series([1, 2, 2, 5, 5], index=labels)
print(s)

a    1
b    2
c    2
d    5
e    5
dtype: int64


In [29]:
# Each dict key becomes a column name; each list supplies that column's values.
student_names = ["harry", 'mohan', 'rohan']
student_marks = [78, 67, 98]
df = pd.DataFrame({"name": student_names, "marks": student_marks})
print(df)

    name  marks
0  harry     78
1  mohan     67
2  rohan     98


Renaming the heading of each column
- df.rename(columns={"old_name": "new_name"})

# Merging two data frames

In [30]:
# First class list: five students with their marks.
first_batch = {
    "name": ['abhishek', 'shreya', 'anjali', 'divya', 'rinki'],
    'marks': [65, 75, 34, 54, 45],
}
df1 = pd.DataFrame(first_batch)
df1

Unnamed: 0,name,marks
0,abhishek,65
1,shreya,75
2,anjali,34
3,divya,54
4,rinki,45


In [31]:
# Second class list with the same two columns as df1, so the frames can be stacked.
second_batch = {
    'name': ["rohan", 'mohan', 'sohan', 'mayank', 'bipin'],
    'marks': [53, 63, 23, 67, 32],
}
df2 = pd.DataFrame(second_batch)
df2

Unnamed: 0,name,marks
0,rohan,53
1,mohan,63
2,sohan,23
3,mayank,67
4,bipin,32


In [32]:
# Stack df2 underneath df1. Each frame keeps its own row labels, so the
# result's index runs 0..4 twice; pass ignore_index=True to renumber instead.
frames = [df1, df2]
df_new = pd.concat(frames)
df_new

Unnamed: 0,name,marks
0,abhishek,65
1,shreya,75
2,anjali,34
3,divya,54
4,rinki,45
0,rohan,53
1,mohan,63
2,sohan,23
3,mayank,67
4,bipin,32


Merging on the basis of common values: here df1 and df3 are merged on the name column, so only the names that exist in both data frames are kept.

In [33]:
# Roll numbers for a handful of names; only some of them also appear in df1.
roll_numbers = {
    'name': ['rohan', 'moahan', 'sohan', 'abhishek', 'shreya'],
    'roll.no': [12, 13, 14, 15, 16],
}
df3 = pd.DataFrame(roll_numbers)

# Default inner join on 'name': only names found in both frames are kept.
pd.merge(left=df1, right=df3, on='name')


Unnamed: 0,name,marks,roll.no
0,abhishek,65,15
1,shreya,75,16


## Saving a file
To save a DataFrame in other file formats we can use:

- df_3.to_excel("output.xlsx", index=False)
- df_3.to_json("output.json", orient="records") # to_json rejects index=False with its default orient

In [34]:
# Write df3 to a CSV file; index=False leaves the row labels out of the file.
csv_path = "Intro.csv"
df3.to_csv(csv_path, index=False)

# Creating a new column

In [35]:
# One grade per row, in row order. The list length must equal the number of
# rows in df_new; a shorter or longer list raises an error.
grades = ['A', 'B', 'C+', 'B+', 'A+', 'A', 'B', 'A++', 'F', 'D']
df_new['Grade'] = grades  # assigning to a new label appends it as the last column
df_new

Unnamed: 0,name,marks,Grade
0,abhishek,65,A
1,shreya,75,B
2,anjali,34,C+
3,divya,54,B+
4,rinki,45,A+
0,rohan,53,A
1,mohan,63,B
2,sohan,23,A++
3,mayank,67,F
4,bipin,32,D


Creating a new column at a specific position.

In [36]:
# DataFrame.insert mutates df_new in place and raises ValueError when the
# column already exists, so guard the call to keep this cell safe to re-run.
if "Roll_no" not in df_new.columns:
    # loc=0 puts the new column first; roll numbers run 1..number of rows,
    # so the values always match the frame's length.
    df_new.insert(0, "Roll_no", list(range(1, len(df_new) + 1)))
df_new

Unnamed: 0,Roll_no,name,marks,Grade
0,1,abhishek,65,A
1,2,shreya,75,B
2,3,anjali,34,C+
3,4,divya,54,B+
4,5,rinki,45,A+
0,6,rohan,53,A
1,7,mohan,63,B
2,8,sohan,23,A++
3,9,mayank,67,F
4,10,bipin,32,D


# Creating a new dataframe from the existing dataframes

In [37]:
# Two example frames with the same number of rows and the same default index.
# NOTE: this rebinds df1 and df2, replacing the frames built earlier.

# First DataFrame
student_info = {
    'roll_no': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [20, 21, 19],
}
df1 = pd.DataFrame(student_info)

# Second DataFrame
student_results = {
    'marks': [85, 92, 78],
    'grade': ['B', 'A', 'C'],
    'status': ['Pass', 'Pass', 'Pass'],
}
df2 = pd.DataFrame(student_results)


In [38]:
# Pick individual columns out of df1 and df2 and assemble them into one frame.
# The columns are Series, so pandas lines them up by row label.
wanted = [(df1, 'roll_no'), (df1, 'name'), (df2, 'marks'), (df2, 'grade')]
df_3 = pd.DataFrame({column: frame[column] for frame, column in wanted})
df_3

Unnamed: 0,roll_no,name,marks,grade
0,1,Alice,85,B
1,2,Bob,92,A
2,3,Charlie,78,C


# Data Exploration of file:
- Accessing the first and last rows 
    - df.head() # gives first 5 rows by default
    - df.tail() # gives 5 last rows by default
- Accessing data type stored in each column.
    - df.info() # returns each column's name (heading), non-null count and data type
- Accessing the name of the $Columns$.
    - df.columns
- Size of data set.
    - df.shape

In [39]:
# Raw employee records, one list per column; all three lists have 13 entries.
names = ['ram', 'syam', 'rahul', 'mohit', 'susheel', 'ajay', 'satyam',
         'naveen', 'anshika', 'pranjali', 'akhand', 'ankanksha', 'sneha']
ages = [21, 22, 32, 23, 34, 25, 35, 22, 23, 52, 22, 25, 23]
salaries = [45000, 23000, 45000, 12000, 50000, 42000, 52000,
            51000, 55000, 48000, 48000, 49000, 51000]
data = {'Name': names, "Age": ages, "salary": salaries}

In [40]:
# Build the frame from the data dict; the dict keys become the column names.
df = pd.DataFrame(data=data)
df.head(n=2)  # only the first two rows (head() alone shows five)



Unnamed: 0,Name,Age,salary
0,ram,21,45000
1,syam,22,23000


In [41]:
df.tail(n=2)  # only the last two rows (tail() alone shows five)

Unnamed: 0,Name,Age,salary
11,ankanksha,25,49000
12,sneha,23,51000


In [42]:
# Print a summary of the frame: index range, each column's name, non-null
# count and dtype, plus the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    13 non-null     object
 1   Age     13 non-null     int64 
 2   salary  13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 440.0+ bytes


# Aggregation
- df['column_name'].sum() # returns the total sum of the column.
- df['column_name'].min() # returns the min value of the column.
- df['column_name'].max() # returns the max value of the column. 
- df['column_name'].mean() # returns the mean value of the column.
- df['column_name'].std() # returns the standard deviation of the column.

In [57]:
# Standard deviation of the Age column (sample std, pandas' default ddof=1).
ages = df['Age']
ages.std()

np.float64(8.780076513888908)

In [44]:
# Descriptive statistics for the numeric columns: count, mean, std, min,
# the 25% / 50% / 75% quartiles and max.
df.describe()

Unnamed: 0,Age,salary
count,13.0,13.0
mean,27.615385,43923.076923
std,8.780077,12406.057247
min,21.0,12000.0
25%,22.0,45000.0
50%,23.0,48000.0
75%,32.0,51000.0
max,52.0,55000.0


In [45]:
shape = df.shape            # (number of rows, number of columns)
column_labels = df.columns  # Index object holding the column names
print(shape)
print(column_labels)

(13, 3)
Index(['Name', 'Age', 'salary'], dtype='object')


## Data Manipulation
- Selection of a specific column.
    - df['Column_name'] # for accessing single column
    - df[['column_1', 'column_2', '...']]  # accessing multiple column
    - df.iloc[:, 1] # returns the 2nd column
    - df.iloc[:, [0,1]] # returns the 1st and 2nd column


- Accessing rows
    - df[df["column_Name"] > "conditional value"] # for single condition
    - df[(df["column_Name"] > "1st conditional value") & (df["column_Name"] > "2nd conditional value")] # both conditions must hold (and)
    - df[(df["column_Name"] > "1st conditional value") | (df["column_Name"] > "2nd conditional value")] # at least one condition must hold (or)

In [46]:
# A single label selects one column and returns a Series ...
column = df["Age"]
# ... while a list of labels returns a DataFrame with just those columns.
wanted_columns = ['Age', 'salary']
mut_col = df[wanted_columns]



In [47]:
# iloc selects by integer position (0-based), not by label.
picked_rows = df.iloc[[0, 2, 5]]  # positions 0, 2, 5 -> the 1st, 3rd and 6th rows
row_slice = df.iloc[6:10]         # positions 6..9 (end is exclusive) -> the 7th to 10th rows
print(picked_rows)
print(row_slice)

    Name  Age  salary
0    ram   21   45000
2  rahul   32   45000
5   ajay   25   42000
       Name  Age  salary
6    satyam   35   52000
7    naveen   22   51000
8   anshika   23   55000
9  pranjali   52   48000


In [48]:
# Filtering rows with a condition: the comparison yields a boolean Series
# (one True/False per row), and indexing with it keeps only the True rows.
older_than_23 = df['Age'] > 23
df[older_than_23]

Unnamed: 0,Name,Age,salary
2,rahul,32,45000
4,susheel,34,50000
5,ajay,25,42000
6,satyam,35,52000
9,pranjali,52,48000
11,ankanksha,25,49000


In [49]:
# Combine conditions element-wise with & (not the keyword `and`, which needs a
# single True/False and fails on a whole Series). and❌, &✅
age_over_25 = df["Age"] > 25
salary_over_45000 = df["salary"] > 45000
df[age_over_25 & salary_over_45000]

Unnamed: 0,Name,Age,salary
4,susheel,34,50000
6,satyam,35,52000
9,pranjali,52,48000


In [50]:
# Keep rows where at least one condition holds; use | for element-wise OR
# (not the keyword `or`). or❌, | ✅
age_over_28 = df["Age"] > 28
salary_over_51000 = df["salary"] > 51000
df[age_over_28 | salary_over_51000]

Unnamed: 0,Name,Age,salary
2,rahul,32,45000
4,susheel,34,50000
6,satyam,35,52000
8,anshika,23,55000
9,pranjali,52,48000


In [51]:
# iloc[rows, columns] with positional lists: rows at positions 1 and 3,
# columns at positions 1 and 2.
row_positions = [1, 3]
column_positions = [1, 2]
df.iloc[row_positions, column_positions]

Unnamed: 0,Age,salary
1,22,23000
3,23,12000
