# Pandas: 
#### Pandas is an opensource data analysis library written in Python. leverages the power and speed of numpy to make data analysis and preprocessing easy for data scientist. It provides rich and highly robust data operations. Numpy written in C.

## Data structure in Pandas

#### Pandas has two main data structures that make working with data easy and intuitive:

### Series:

- A Series is like a list of values, but each value has a label, called an "index."
- Similar to single column of data with labeled rows.
- You can access values by their labels, making data retrieval quick and readable.


In [2]:
import pandas as pd

# Creating a Series
series = pd.Series([10, 20, 30], index=['A', 'B', 'C'])
print(type(series))
series


<class 'pandas.core.series.Series'>


A    10
B    20
C    30
dtype: int64

In [3]:
pd.__version__

'2.2.2'

### DataFrame:

- A DataFrame is like a table with rows and columns, where each column is a Series.
- It can hold data of different types (e.g., text, numbers) in each column.
- DataFrames have both row and column labels, which makes it easier to access and manipulate data based on these labels.

In [4]:
import pandas as pd

# Creating a DataFrame from a dictionary
data = {
    'Name': ['Navin', 'Sam', 'Charlie'],
    'Age': [32, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
print(type(data))
df = pd.DataFrame(data)
df


<class 'dict'>


Unnamed: 0,Name,Age,City
0,Navin,32,New York
1,Sam,30,Los Angeles
2,Charlie,35,Chicago


In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
# tells data types of each column
df.dtypes


Name    object
Age      int64
City    object
dtype: object

### Creating a large dataframe

In [7]:
import numpy as np
df1=pd.DataFrame(np.random.randn(1110,10),columns=['A','B','C','D','E','F','G','H','I','J'],index=[np.arange(1110)])

# Multiplying each value by 100
df1 = df1 * 100
df1.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,-14.340578,-5.228009,10.669108,-50.03189,-111.502423,76.090486,-92.946589,107.443851,67.4354,-34.648964
1,-78.137724,6.451642,-204.13765,-44.44607,-3.920353,-117.09608,-109.050666,73.259073,8.050844,117.33582
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.87562,57.646625,83.57856,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.09428,-39.31888
4,-14.809407,-34.137635,155.57632,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688


In [8]:
df1.dtypes

A    float64
B    float64
C    float64
D    float64
E    float64
F    float64
G    float64
H    float64
I    float64
J    float64
dtype: object

In [9]:
#changing the data type of column A to Object from float
df1.at[0,'A']="gaurav"
df1.dtypes

  df1.at[0,'A']="gaurav"


A     object
B    float64
C    float64
D    float64
E    float64
F    float64
G    float64
H    float64
I    float64
J    float64
dtype: object

In [10]:
df1.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,gaurav,-5.228009,10.669108,-50.03189,-111.502423,76.090486,-92.946589,107.443851,67.4354,-34.648964
1,-78.137724,6.451642,-204.13765,-44.44607,-3.920353,-117.09608,-109.050666,73.259073,8.050844,117.33582
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.87562,57.646625,83.57856,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.09428,-39.31888
4,-14.809407,-34.137635,155.57632,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688


In [11]:
df1.index

MultiIndex([(   0,),
            (   1,),
            (   2,),
            (   3,),
            (   4,),
            (   5,),
            (   6,),
            (   7,),
            (   8,),
            (   9,),
            ...
            (1100,),
            (1101,),
            (1102,),
            (1103,),
            (1104,),
            (1105,),
            (1106,),
            (1107,),
            (1108,),
            (1109,)],
           length=1110)

In [12]:
df1 = df1.reset_index(drop=True)
df1.index

RangeIndex(start=0, stop=1110, step=1)

In [13]:
df1.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')

In [14]:
#to convert dataframe into numpy array
df1.to_numpy()

array([['gaurav', -5.228008540922218, 10.66910792102057, ...,
        107.44385080648382, 67.43540031964905, -34.648964138556394],
       [-78.13772372451182, 6.451641549588645, -204.13764963379273, ...,
        73.25907286789402, 8.050844011709405, 117.33582023359239],
       [53.717716936821766, -58.558068102622954, -75.30015835379704, ...,
        57.64662480381551, 83.57855982569312, -214.94554601242885],
       ...,
       [-142.00440645038032, -147.54460857989125, -144.90628034789813,
        ..., 100.30225025783265, 93.05216776111747, -42.731504292105974],
       [-4.9381511562267955, 114.72789583857721, 69.59620202010444, ...,
        -110.40204964277017, 73.62544943411693, 44.83175220459031],
       [122.90429341966251, -160.6128888008743, 4.078333595861207, ...,
        -80.47155218254221, -79.56414534646129, 73.74482393521335]],
      dtype=object)

In [15]:
#Another Example
dict_new={
    "name":['Gaurav','Navin Sir','Harsh Bhaiya','Sushil'],
    "marks":[96,98,85,88],
    "city":['Gaya','Bengaluru','Jodhpur','Bikaner']
    
}

In [16]:
dframe=pd.DataFrame(dict_new)

In [17]:
dframe

Unnamed: 0,name,marks,city
0,Gaurav,96,Gaya
1,Navin Sir,98,Bengaluru
2,Harsh Bhaiya,85,Jodhpur
3,Sushil,88,Bikaner


In [18]:
#we can convert this data frame to any file format. Let's say we want to convert it into .csv
#index means row in general and column ko column hi bolte hain.
dframe.to_csv('office.csv',index=False)

In [19]:
dframe.head(2)

Unnamed: 0,name,marks,city
0,Gaurav,96,Gaya
1,Navin Sir,98,Bengaluru


In [20]:
dframe.tail(2)

Unnamed: 0,name,marks,city
2,Harsh Bhaiya,85,Jodhpur
3,Sushil,88,Bikaner


In [21]:
#it does a numerical analysis
dframe.describe()

Unnamed: 0,marks
count,4.0
mean,91.75
std,6.238322
min,85.0
25%,87.25
50%,92.0
75%,96.5
max,98.0


In [22]:
#to read a csv file
flight_details=pd.read_csv('flight_details.csv');

FileNotFoundError: [Errno 2] No such file or directory: 'flight_details.csv'

In [208]:
flight_details

Unnamed: 0,flight_no,price,destination
0,GS1234,9600,Gaya
1,NR1234,9800,Bengaluru
2,HB1234,8500,Jodhpur
3,SR1234,8800,Bikaner


In [209]:
flight_details['price']

0    9600
1    9800
2    8500
3    8800
Name: price, dtype: int64

In [210]:
flight_details['price'][2]=10000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flight_details['price'][2]=10000


In [211]:
flight_details

Unnamed: 0,flight_no,price,destination
0,GS1234,9600,Gaya
1,NR1234,9800,Bengaluru
2,HB1234,10000,Jodhpur
3,SR1234,8800,Bikaner


### to change the index of data frame

In [212]:
flight_details.index=['first','second','third','fourth']

In [213]:
flight_details

Unnamed: 0,flight_no,price,destination
first,GS1234,9600,Gaya
second,NR1234,9800,Bengaluru
third,HB1234,10000,Jodhpur
fourth,SR1234,8800,Bikaner


# Pandas Attributes

In [49]:
#transpose
df.T

Unnamed: 0,0,1,2
Name,Navin,Sam,Charlie
Age,32,30,35
City,New York,Los Angeles,Chicago


In [50]:
# sort index in backward manner
#axis 0 is index and axis 1 is column
df1.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
1109,79.932137,-58.609822,-112.096787,178.118322,-76.530894,-30.780197,-78.453961,144.628778,-105.841382,-4.319715
1108,66.378387,161.865408,142.183301,-9.831406,226.940990,-101.984401,-111.373192,-50.684074,-93.725421,-40.485899
1107,-11.101721,15.813809,136.650236,-59.415701,-69.268946,-123.160640,-30.030198,90.711070,-43.960390,-90.972889
1106,-23.861151,71.042965,-9.006008,92.864398,57.167073,-47.251723,134.775147,159.050804,43.650730,56.106725
1105,79.115375,-1.756113,-89.983294,-19.093529,-100.884644,81.590357,58.712724,-48.575769,5.054406,-0.232954
...,...,...,...,...,...,...,...,...,...,...
4,-28.725075,104.517189,-61.058046,85.316982,219.217761,-7.662565,41.651260,-4.291010,-116.938312,-138.594196
3,-180.05647,-102.665671,-30.039538,40.092439,146.216619,-11.949893,50.200077,-25.883205,30.322389,89.885739
2,16.985692,-15.204893,-78.825125,-109.544859,-81.474259,-91.625366,68.079077,260.641556,-124.352504,5.789724
1,-140.790067,-114.784799,-128.979703,86.107063,-30.619402,122.342235,-0.047714,-28.286929,65.979052,43.815885


In [52]:
df1['A']

0           gaurav
1      -140.790067
2        16.985692
3       -180.05647
4       -28.725075
           ...    
1105     79.115375
1106    -23.861151
1107    -11.101721
1108     66.378387
1109     79.932137
Name: A, Length: 1110, dtype: object

In [53]:
type(df1['A'])

pandas.core.series.Series

## View and Copy

In [25]:
#View
df2=df1
df2.at[0,'A']="Sharma"
df1
#you can see, changing df2 changes df1 as df2 is only a view of df1. so changes in df2 will lead to changes in df1.

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,Sharma,-5.228009,10.669108,-50.031890,-111.502423,76.090486,-92.946589,107.443851,67.435400,-34.648964
1,-78.137724,6.451642,-204.137650,-44.446070,-3.920353,-117.096080,-109.050666,73.259073,8.050844,117.335820
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.875620,57.646625,83.578560,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.094280,-39.318880
4,-14.809407,-34.137635,155.576320,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688
...,...,...,...,...,...,...,...,...,...,...
1105,-200.781175,160.682101,-16.841627,36.628485,13.685852,-106.496468,-72.153336,-11.018913,-158.091697,130.215965
1106,-86.888182,49.720129,51.324219,92.176180,-75.210718,-161.194211,-75.754060,-26.477631,-123.918766,48.511527
1107,-142.004406,-147.544609,-144.906280,-128.710515,61.983856,134.302059,-22.573077,100.302250,93.052168,-42.731504
1108,-4.938151,114.727896,69.596202,-32.863036,52.796462,50.363067,29.828807,-110.402050,73.625449,44.831752


In [26]:
#Copy
df3=df1.copy()
df3.at[0,'A']="Java"
df1
#one can see that changes in the copied dataframe df3 does not affect df1.

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,Sharma,-5.228009,10.669108,-50.031890,-111.502423,76.090486,-92.946589,107.443851,67.435400,-34.648964
1,-78.137724,6.451642,-204.137650,-44.446070,-3.920353,-117.096080,-109.050666,73.259073,8.050844,117.335820
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.875620,57.646625,83.578560,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.094280,-39.318880
4,-14.809407,-34.137635,155.576320,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688
...,...,...,...,...,...,...,...,...,...,...
1105,-200.781175,160.682101,-16.841627,36.628485,13.685852,-106.496468,-72.153336,-11.018913,-158.091697,130.215965
1106,-86.888182,49.720129,51.324219,92.176180,-75.210718,-161.194211,-75.754060,-26.477631,-123.918766,48.511527
1107,-142.004406,-147.544609,-144.906280,-128.710515,61.983856,134.302059,-22.573077,100.302250,93.052168,-42.731504
1108,-4.938151,114.727896,69.596202,-32.863036,52.796462,50.363067,29.828807,-110.402050,73.625449,44.831752


In [27]:
df3

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,Java,-5.228009,10.669108,-50.031890,-111.502423,76.090486,-92.946589,107.443851,67.435400,-34.648964
1,-78.137724,6.451642,-204.137650,-44.446070,-3.920353,-117.096080,-109.050666,73.259073,8.050844,117.335820
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.875620,57.646625,83.578560,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.094280,-39.318880
4,-14.809407,-34.137635,155.576320,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688
...,...,...,...,...,...,...,...,...,...,...
1105,-200.781175,160.682101,-16.841627,36.628485,13.685852,-106.496468,-72.153336,-11.018913,-158.091697,130.215965
1106,-86.888182,49.720129,51.324219,92.176180,-75.210718,-161.194211,-75.754060,-26.477631,-123.918766,48.511527
1107,-142.004406,-147.544609,-144.906280,-128.710515,61.983856,134.302059,-22.573077,100.302250,93.052168,-42.731504
1108,-4.938151,114.727896,69.596202,-32.863036,52.796462,50.363067,29.828807,-110.402050,73.625449,44.831752


## Deep and Shallow Copy in Pandas


### Shallow Copy
#### A shallow copy creates a new DataFrame object, but it still references the original data. This means that some modifications to the shallow copy may affect the original DataFrame as both share the same data in memory.

 a shallow copy can be done using df4 = df1[:]

In [28]:
#shallow copy
df4=df1[:]
df4

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,Sharma,-5.228009,10.669108,-50.031890,-111.502423,76.090486,-92.946589,107.443851,67.435400,-34.648964
1,-78.137724,6.451642,-204.137650,-44.446070,-3.920353,-117.096080,-109.050666,73.259073,8.050844,117.335820
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.875620,57.646625,83.578560,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.094280,-39.318880
4,-14.809407,-34.137635,155.576320,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688
...,...,...,...,...,...,...,...,...,...,...
1105,-200.781175,160.682101,-16.841627,36.628485,13.685852,-106.496468,-72.153336,-11.018913,-158.091697,130.215965
1106,-86.888182,49.720129,51.324219,92.176180,-75.210718,-161.194211,-75.754060,-26.477631,-123.918766,48.511527
1107,-142.004406,-147.544609,-144.906280,-128.710515,61.983856,134.302059,-22.573077,100.302250,93.052168,-42.731504
1108,-4.938151,114.727896,69.596202,-32.863036,52.796462,50.363067,29.828807,-110.402050,73.625449,44.831752


### Deep Copy
#### A deep copy creates a new DataFrame object with entirely independent data. Changes made to a deep copy won’t affect the original DataFrame, as it has a separate memory allocation.

In [29]:
#deep copy
df5 = df1.copy()
df5

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,Sharma,-5.228009,10.669108,-50.031890,-111.502423,76.090486,-92.946589,107.443851,67.435400,-34.648964
1,-78.137724,6.451642,-204.137650,-44.446070,-3.920353,-117.096080,-109.050666,73.259073,8.050844,117.335820
2,53.717717,-58.558068,-75.300158,-101.094662,-5.784238,260.540701,263.875620,57.646625,83.578560,-214.945546
3,198.947736,161.522812,-118.499216,75.688573,-58.280551,44.819885,70.936454,133.999289,-46.094280,-39.318880
4,-14.809407,-34.137635,155.576320,-77.981783,54.286133,52.308045,-70.644105,48.272009,25.038397,37.830688
...,...,...,...,...,...,...,...,...,...,...
1105,-200.781175,160.682101,-16.841627,36.628485,13.685852,-106.496468,-72.153336,-11.018913,-158.091697,130.215965
1106,-86.888182,49.720129,51.324219,92.176180,-75.210718,-161.194211,-75.754060,-26.477631,-123.918766,48.511527
1107,-142.004406,-147.544609,-144.906280,-128.710515,61.983856,134.302059,-22.573077,100.302250,93.052168,-42.731504
1108,-4.938151,114.727896,69.596202,-32.863036,52.796462,50.363067,29.828807,-110.402050,73.625449,44.831752
