# What is Pandas?

Pandas is a Python library used for working with data sets.

It has functions for analyzing, cleaning, exploring, and manipulating data.

The name "Pandas" refers to both "Panel Data" and "Python Data Analysis". The library was created by Wes McKinney in 2008.



In [2]:
from pathlib import Path

import pandas as pd

In [3]:
# Build a small DataFrame from a dict that maps column name -> column values.
car_names = ["BMW", "Volvo", "Ford"]
passing_counts = [3, 7, 2]
mydataset = {'cars': car_names, 'passings': passing_counts}

myvar = pd.DataFrame(data=mydataset)

print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [4]:
# Report which pandas release is installed in this kernel.
installed_version = pd.__version__
print(installed_version)

2.1.4


# What is a Series?

A Pandas Series is like a column in a table.

It is a one-dimensional array holding data of any type.

In [5]:
# Wrap a plain Python list in a Series; with no index given, the values
# are labelled 0, 1, 2.
a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)

0    1
1    7
2    2
dtype: int64


An array is a linear data structure in which all elements are arranged sequentially.
It is a collection of elements of the same data type stored at contiguous memory locations.

In [6]:
# No index was supplied when the Series was built, so its labels are 0, 1, 2;
# label 0 therefore selects the first value.
myvar[0]

1

In [7]:
# Attach custom labels to the values through the `index` argument.
a = [1, 7, 2]
labels = ["x", "y", "z"]

myvar = pd.Series(a, index=labels)

print(myvar)

x    1
y    7
z    2
dtype: int64


In [8]:
# Look up a value by the custom label that was assigned via `index=`.
myvar["y"]

7

In [9]:
# Build a Series from a dict: the dict keys become the index labels.
calories = {"day1": 420, "day2": 380, "day3": 390}

# Bound to `myvar` like the other Series examples in this notebook; the name
# `df` would wrongly suggest that this object is a DataFrame.
myvar = pd.Series(calories)

print(myvar)

day1    420
day2    380
day3    390
dtype: int64


In [10]:
# Passing `index` together with a dict keeps only the listed keys.
calories = {"day1": 420, "day2": 380, "day3": 390}
selected_days = ["day1", "day2"]

myvar = pd.Series(calories, index=selected_days)

print(myvar)

day1    420
day2    380
dtype: int64


## DataFrames
Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

A Series is like a column; a DataFrame is the whole table.

In [11]:
# Column-oriented dict: each key becomes a column, each list holds its values.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}

myvar = pd.DataFrame(data=data)

print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


In [16]:
# Creating a DataFrame from a dictionary: the keys become the column names
# and each list supplies that column's values.
people_by_column = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
# Kept under its own name so it is not overwritten by the second example
# below and can still be inspected.
df_from_dict = pd.DataFrame(people_by_column)

# Creating a DataFrame from a list of lists: each inner list is one row, so
# the column names have to be passed explicitly.
data = [
    [1, 'Alice', 25, "New York"],
    [2, 'Bob', 30, "Los Angeles"],
    [3, 'Charlie', 35, "Chicago"]
]
df = pd.DataFrame(data, columns=['ID', 'Name', 'Age', 'City'])

In [17]:
# A bare expression on the last line of a cell is shown as the cell's output.
df

Unnamed: 0,ID,Name,Age,City
0,1,Alice,25,New York
1,2,Bob,30,Los Angeles
2,3,Charlie,35,Chicago


In [14]:
# Select one column by its label; the result is a Series.
name_column = df['Name']
print(name_column)

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object


In [18]:
# Select several columns at once by passing a list of labels.
wanted_columns = ['Name', 'City']
print(df[wanted_columns])

      Name         City
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago


In [19]:
# Select a single row by integer position (0 is the first row).
first_row = df.iloc[0]
print(first_row)

ID             1
Name       Alice
Age           25
City    New York
Name: 0, dtype: object


In [20]:
# Build a boolean mask, then keep only the rows where it is True.
is_over_30 = df['Age'] > 30
print(df[is_over_30])

   ID     Name  Age     City
2   3  Charlie   35  Chicago


In [21]:
# Look up the row whose index label is 0.
row_zero = df.loc[0]
print(row_zero)

ID             1
Name       Alice
Age           25
City    New York
Name: 0, dtype: object


In [22]:
# Passing a list of labels to .loc selects all of those rows.
wanted_labels = [0, 1]
print(df.loc[wanted_labels])

   ID   Name  Age         City
0   1  Alice   25     New York
1   2    Bob   30  Los Angeles


In [23]:
# Build the frame with named row labels instead of the default 0, 1, 2.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
day_labels = ["day1", "day2", "day3"]

df = pd.DataFrame(data, index=day_labels)

print(df)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


In [24]:
# Select a row by its named index label.
day2_row = df.loc["day2"]
print(day2_row)

calories    380
duration     40
Name: day2, dtype: int64


`iloc`: use when you need to select data by integer position (e.g., the first row or the second column).

`loc`: use when you need to select data by label (e.g., the row labeled `"day2"` or the column named `"calories"`).

In [26]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="MOCK.csv")

In [27]:
# Display the frame loaded from MOCK.csv as the cell's output.
df

Unnamed: 0,id,first_name,last_name,email,gender,Salary
0,1,Gilda,Adlington,gadlington0@blogspot.com,Female,160445.616
1,2,Cristin,Le Leu,cleleu1@latimes.com,Female,149586.179
2,3,Rocky,Hawker,rhawker2@scribd.com,Male,244327.161
3,4,Burtie,Rantoull,brantoull3@economist.com,Male,98424.791
4,5,Dion,Strephan,dstrephan4@oaic.gov.au,Female,53230.053
...,...,...,...,...,...,...
995,996,Orelle,Gipson,ogipsonrn@privacy.gov.au,Female,48156.976
996,997,Dianne,Syrad,dsyradro@nydailynews.com,Female,124447.965
997,998,Tammy,Thornebarrow,tthornebarrowrp@vk.com,Female,233226.326
998,999,Almeta,Kilgallon,akilgallonrq@netlog.com,Genderqueer,108324.909


In [28]:
# Preview the first five rows (five is head()'s default).
df.head(n=5)

Unnamed: 0,id,first_name,last_name,email,gender,Salary
0,1,Gilda,Adlington,gadlington0@blogspot.com,Female,160445.616
1,2,Cristin,Le Leu,cleleu1@latimes.com,Female,149586.179
2,3,Rocky,Hawker,rhawker2@scribd.com,Male,244327.161
3,4,Burtie,Rantoull,brantoull3@economist.com,Male,98424.791
4,5,Dion,Strephan,dstrephan4@oaic.gov.au,Female,53230.053


In [29]:
# Preview the last five rows (five is tail()'s default).
df.tail(n=5)

Unnamed: 0,id,first_name,last_name,email,gender,Salary
995,996,Orelle,Gipson,ogipsonrn@privacy.gov.au,Female,48156.976
996,997,Dianne,Syrad,dsyradro@nydailynews.com,Female,124447.965
997,998,Tammy,Thornebarrow,tthornebarrowrp@vk.com,Female,233226.326
998,999,Almeta,Kilgallon,akilgallonrq@netlog.com,Genderqueer,108324.909
999,1000,Bibbye,McKyrrelly,bmckyrrellyrr@facebook.com,Female,56366.396


In [32]:
# Render the whole frame as a single text block, then print it, so that
# the full data is shown.
full_table_text = df.to_string()
print(full_table_text)

       id   first_name        last_name                                email       gender      Salary
0       1        Gilda        Adlington             gadlington0@blogspot.com       Female  160445.616
1       2      Cristin           Le Leu                  cleleu1@latimes.com       Female  149586.179
2       3        Rocky           Hawker                  rhawker2@scribd.com         Male  244327.161
3       4       Burtie         Rantoull             brantoull3@economist.com         Male   98424.791
4       5         Dion         Strephan               dstrephan4@oaic.gov.au       Female   53230.053
5       6       Joanne       Darbishire                 jdarbishire5@wsj.com       Female  126432.839
6       7         Isak         Lafrentz               ilafrentz6@tinyurl.com         Male  218744.647
7       8       Rutter       Newbigging                 rnewbigging7@mac.com         Male   76300.856
8       9        Locke         Guilloux             lguilloux8@newyorker.com      

In [37]:
# Load a dataset that lives outside the notebook's folder. The location is
# built from the current user's home directory rather than a hard-coded
# C:/Users/<name> prefix, so the cell does not depend on one machine's
# user account.
gujarat_csv = Path.home() / "Downloads" / "GUJARAT (1).csv"
df_1 = pd.read_csv(gujarat_csv)

In [38]:
# Preview the first five rows (five is head()'s default).
df_1.head(n=5)

Unnamed: 0,State,FIRST NAME,LAST NAME,USER NAME,EMAIL,PHONE,TOTAL ORDERS,CUSTOMER AOV,LAST ORDER DATE,PRODUCT TYPES,STORE AOV
0,Gujarat,Ghanshyam,Bhanderi,Ghanshyam Bhanderi,ghanshyam6651@gmail.com,920000000000.0,1,24228.0,2024-02-20T23:59:59.000000Z,Soundbar,1329.34
1,Gujarat,Hanif,Khan,Hanif Khan,khanhanif91@yahoo.in,920000000000.0,1,22649.0,2024-05-01T23:59:59.000000Z,,1329.34
2,Gujarat,Parag,Jogani,Parag Jogani,shahscreations@yahoo.co.in,919000000000.0,1,14048.0,2024-01-26T23:59:59.000000Z,Soundbar,1329.34
3,Gujarat,Ghanshyam,Bhutra,Ghanshyam Bhutra,gdbmaruthi@gmail.com,919000000000.0,1,13098.0,2024-01-08T23:59:59.000000Z,"Party Speaker,BT Speaker",1329.34
4,Gujarat,Valkesh,Prajapati,Valkesh Prajapati,valkeshprajapati1982@gmail.com,919000000000.0,1,11299.0,2024-02-02T23:59:59.000000Z,Speaker,1329.34


In [36]:
# Display the frame as the cell's output; the recorded output is truncated
# in the middle with "..." rows.
df_1

Unnamed: 0,State,FIRST NAME,LAST NAME,USER NAME,EMAIL,PHONE,TOTAL ORDERS,CUSTOMER AOV,LAST ORDER DATE,PRODUCT TYPES,STORE AOV
0,Gujarat,Ghanshyam,Bhanderi,Ghanshyam Bhanderi,ghanshyam6651@gmail.com,9.200000e+11,1,24228.0,2024-02-20T23:59:59.000000Z,Soundbar,1329.34
1,Gujarat,Hanif,Khan,Hanif Khan,khanhanif91@yahoo.in,9.200000e+11,1,22649.0,2024-05-01T23:59:59.000000Z,,1329.34
2,Gujarat,Parag,Jogani,Parag Jogani,shahscreations@yahoo.co.in,9.190000e+11,1,14048.0,2024-01-26T23:59:59.000000Z,Soundbar,1329.34
3,Gujarat,Ghanshyam,Bhutra,Ghanshyam Bhutra,gdbmaruthi@gmail.com,9.190000e+11,1,13098.0,2024-01-08T23:59:59.000000Z,"Party Speaker,BT Speaker",1329.34
4,Gujarat,Valkesh,Prajapati,Valkesh Prajapati,valkeshprajapati1982@gmail.com,9.190000e+11,1,11299.0,2024-02-02T23:59:59.000000Z,Speaker,1329.34
...,...,...,...,...,...,...,...,...,...,...,...
7572,MP,Anurag,Jaiswal,Anurag Jaiswal,7911anurag@gmail.com,9.200000e+11,1,0.0,2024-04-25T23:59:59.000000Z,,1329.34
7573,MP,Lokesh,Lokesh,Lokesh Lokesh,lokeshsahu3793@gmail.com,9.180000e+11,1,0.0,2024-04-22T23:59:59.000000Z,,1329.34
7574,MP,DURGESH,DURGESH,DURGESH DURGESH,lokeshsahu97.ls@gmail.com,9.190000e+11,1,0.0,2024-04-24T23:59:59.000000Z,,1329.34
7575,MP,Nikky,bodkhe,Nikky bodkhe,Dhotechetan04@gmail.com,9.200000e+11,1,0.0,2024-04-29T23:59:59.000000Z,,1329.34


In [39]:
# Show the current limit on how many rows pandas prints before truncating.
print(pd.get_option("display.max_rows"))

60


In [41]:
# Raise the row limit so that printing the frame shows every row instead of
# a truncated preview. This setting stays in effect for later cells.
pd.options.display.max_rows = 8000

# The location is built from the current user's home directory rather than a
# hard-coded C:/Users/<name> prefix, so the cell does not depend on one
# machine's user account.
gujarat_csv = Path.home() / "Downloads" / "GUJARAT (1).csv"
df_2 = pd.read_csv(gujarat_csv)

print(df_2)

            State           FIRST NAME  \
0         Gujarat            Ghanshyam   
1         Gujarat                Hanif   
2         Gujarat                Parag   
3         Gujarat            Ghanshyam   
4         Gujarat              Valkesh   
5         Gujarat                    D   
6         Gujarat               chevli   
7         Gujarat               bharat   
8         Gujarat           Pruthviraj   
9         Gujarat             Kotadiya   
10        Gujarat              Swapnil   
11        Gujarat                Jigar   
12        Gujarat                Zahir   
13        Gujarat                Kanha   
14        Gujarat                SAPAN   
15        Gujarat                Yagna   
16        Gujarat               Ashish   
17        Gujarat             dharmesh   
18        Gujarat              Sanjeet   
19        Gujarat               Harish   
20        Gujarat               Sachin   
21        Gujarat               Pathan   
22        Gujarat              Sau

In [42]:
# Render the whole frame as a single text block, then print it.
full_table_text = df_2.to_string()
print(full_table_text)

            State           FIRST NAME                                            LAST NAME                                                   USER NAME                                    EMAIL         PHONE  TOTAL ORDERS  CUSTOMER AOV              LAST ORDER DATE             PRODUCT TYPES  STORE AOV
0         Gujarat            Ghanshyam                                             Bhanderi                                          Ghanshyam Bhanderi                  ghanshyam6651@gmail.com  9.200000e+11             1      24228.00  2024-02-20T23:59:59.000000Z                  Soundbar    1329.34
1         Gujarat                Hanif                                                 Khan                                                  Hanif Khan                     khanhanif91@yahoo.in  9.200000e+11             1      22649.00  2024-05-01T23:59:59.000000Z                       NaN    1329.34
2         Gujarat                Parag                                               Jogani      

In [44]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          1000 non-null   int64  
 1   first_name  1000 non-null   object 
 2   last_name   1000 non-null   object 
 3   email       1000 non-null   object 
 4   gender      1000 non-null   object 
 5   Salary      1000 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 47.0+ KB


In [45]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage. A non-null count below the number of entries
# means that column has missing values.
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7577 entries, 0 to 7576
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   State            7576 non-null   object 
 1   FIRST NAME       7577 non-null   object 
 2   LAST NAME        7478 non-null   object 
 3   USER NAME        7577 non-null   object 
 4   EMAIL            7570 non-null   object 
 5   PHONE            7577 non-null   float64
 6   TOTAL ORDERS     7577 non-null   int64  
 7   CUSTOMER AOV     7577 non-null   float64
 8   LAST ORDER DATE  7577 non-null   object 
 9   PRODUCT TYPES    3758 non-null   object 
 10  STORE AOV        7577 non-null   float64
dtypes: float64(3), int64(1), object(7)
memory usage: 651.3+ KB
