In [1]:
import numpy as np
import pandas as pd

# NumPy vs Pandas: A Side-by-Side Comparison

| **Feature**             | **NumPy**                                      | **Pandas**                                      |
|------------------------|------------------------------------------------|-------------------------------------------------|
| **Core Purpose**        | Numerical computing, handling large arrays     | Data manipulation and analysis, handling structured data |
| **Data Structures**     | `ndarray` (n-dimensional array)                | `Series` (1D), `DataFrame` (2D); `Panel` (3D) was removed in pandas 0.25 |
| **Data Type**           | Homogeneous (all elements are the same type)   | Heterogeneous (can hold different data types in the same table) |
| **Indexing**            | Positional (numerical index)                   | Label-based (row and column names) and positional |
| **Operations**          | Fast numerical computations                    | Powerful data wrangling (merge, reshape, filter, group) |
| **Performance**         | High performance for numerical data            | Slightly slower than NumPy due to handling labels and mixed data types |
| **Use Case**            | Scientific computing, machine learning         | Data cleaning, transformation, and analysis |
| **Mathematical Functions** | Linear algebra, random number generation     | Limited, but can use NumPy under the hood for math operations |
| **File Handling**       | Limited (manual loading and saving required)   | Built-in methods for reading and writing from/to CSV, Excel, SQL, etc. |
| **Learning Curve**      | Easier for numerical tasks, requires good understanding of arrays | More intuitive for handling tabular data, especially for data analysts |
| **Dependencies**        | Base library for numerical operations in Python | Built on top of NumPy, relies on NumPy for many operations |
| **Integration**         | Often used in machine learning libraries like TensorFlow, PyTorch | Widely used in data science workflows with integration to other libraries like Matplotlib, Scikit-learn |

**Conclusion**:  
- **NumPy** is best for high-performance numerical computations and large datasets where mathematical operations are the primary focus.
- **Pandas** excels at data manipulation, especially when working with structured, tabular datasets with rows and columns.


In [2]:
# Index labels reused below when building a labelled Series
labels = ['a','b','c']

In [3]:
# The same three values held twice: as a Python list and as a NumPy array
mylist = [10, 20, 30]
arr = np.asarray(mylist)

In [4]:
# Label -> value mapping (same labels and values as `labels` / `mylist`)
d = dict(a=10, b=20, c=30)

### The pandas.Series constructor creates a one-dimensional labeled array capable of holding any data type (integers, strings, floats, etc.).
### A Series is what a single DataFrame column is; with dtype `object` it can even hold values of mixed types. Each element in the Series is associated with an index label.
    pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)


In [5]:
# No index given, so pandas falls back to the default integer index 0, 1, 2
pd.Series(data=mylist)

0    10
1    20
2    30
dtype: int64

In [6]:
# A named (label-based) index is a pandas feature that a plain NumPy array does not offer.
# A pandas Series can be thought of as a NumPy array plus a named index.
pd.Series(data=arr,index=labels)

a    10
b    20
c    30
dtype: int32

In [7]:
# A Series can hold heterogeneous data; the mixed values below end up with dtype object
pd.Series(data=['a',10,'c',20.1,30])

0       a
1      10
2       c
3    20.1
4      30
dtype: object

In [8]:
# Currency per region; the dict keys become the index labels
ser1 = pd.Series({'USA': 'dollar', 'India': 'rupee', 'Europe': 'euro'})
print(ser1)

USA       dollar
India      rupee
Europe      euro
dtype: object


In [9]:
ser1['USA'] # look up a value by its index label (named index) rather than by position

'dollar'

## Adding two series
    We can add two Series in pandas, and pandas will automatically align them on their index labels before adding.
    NaN (Not a Number) marks a missing value; here it appears when a label is present in only one of the two Series.

In [10]:
# Two Series whose labels only partly overlap: 'd' exists only in s1, 'e' only in s2
s1 = pd.Series({'a': 10, 'b': 20, 'c': 30, 'd': 40})
s2 = pd.Series({'a': 5, 'b': 15, 'c': 25, 'e': 35})
print(s1)
print(s2)
# Addition aligns on index labels; labels missing from either side come out as NaN
result = s1.add(s2)
print(result)

a    10
b    20
c    30
d    40
dtype: int64
a     5
b    15
c    25
e    35
dtype: int64
a    15.0
b    35.0
c    55.0
d     NaN
e     NaN
dtype: float64


## Handle NaN values
    We can handle the NaN values that may appear after a mathematical operation on Series by using
    "fillna()", which replaces every NaN with the value you pass to it (e.g. fillna(-1)).

In [11]:
s1 = pd.Series([10, 20, 30, 40], index=list('abcd'))
s3 = pd.Series([1, 2, 3, 4], index=list('abef'))
# Only 'a' and 'b' exist in both Series, so every other label sums to NaN
aligned_sum = s1 + s3
print(aligned_sum)
# Replace the NaN produced by the unmatched labels with -1
result_with_fillna = aligned_sum.fillna(-1)
print(result_with_fillna)

a    11.0
b    22.0
c     NaN
d     NaN
e     NaN
f     NaN
dtype: float64
a    11.0
b    22.0
c    -1.0
d    -1.0
e    -1.0
f    -1.0
dtype: float64


# Dataframes
In pandas, a DataFrame is a two-dimensional, tabular data structure similar to a table in a relational database or an Excel spreadsheet. It consists of rows and columns, where each column can be of a different data type (e.g., integers, floats, strings, etc.)

### A DataFrame is simply multiple Series that share the same index

In [12]:
# Column-oriented input: each key becomes a column, each list holds that column's values
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    Score=[85.5, 90.0, 88.0],
)
df = pd.DataFrame(data)

# Show the resulting table (rows get the default integer index)
print(df)

      Name  Age  Score
0    Alice   25   85.5
1      Bob   30   90.0
2  Charlie   35   88.0


In [13]:
# Re-create the DataFrame from the same `data` dict, this time with a named (label) index
df = pd.DataFrame(data, index=['ID1', 'ID2', 'ID3'])

# Display the DataFrame; the row labels ID1..ID3 replace the default 0..2
print(df)

        Name  Age  Score
ID1    Alice   25   85.5
ID2      Bob   30   90.0
ID3  Charlie   35   88.0


In [14]:
# Fix the legacy global RNG seed so the matrix below is reproducible
np.random.seed(101)
# 5 rows x 4 columns of standard-normal samples
rand_mat = np.random.standard_normal((5, 4))
print(rand_mat)

[[ 2.70684984  0.62813271  0.90796945  0.50382575]
 [ 0.65111795 -0.31931804 -0.84807698  0.60596535]
 [-2.01816824  0.74012206  0.52881349 -0.58900053]
 [ 0.18869531 -0.75887206 -0.93323722  0.95505651]
 [ 0.19079432  1.97875732  2.60596728  0.68350889]]


In [15]:
# Wrap the random matrix in a DataFrame; with no index/columns given,
# both rows and columns get default integer labels
df=pd.DataFrame(data=rand_mat)
df

Unnamed: 0,0,1,2,3
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [16]:
# Same data, now with row labels A-E and column labels W-Z
df=pd.DataFrame(data=rand_mat,index='A B C D E'.split(),columns='W X Y Z'.split())
# str.split() with no arguments splits the string on whitespace and returns a list of the pieces
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [17]:
df['W'] # selecting a single column returns a Series (one column = one Series)

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [18]:
temp=df[['W','Y']] # a list of column names selects several columns -> the result is a DataFrame (a DataFrame is a collection of Series)
temp

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813
D,0.188695,-0.933237
E,0.190794,2.605967


In [19]:
# Adding new columns (Series) to the DataFrame by assigning to a new column name
df['New'] = "! @ # $ %".split() # one value per row, taken from a list
df['sum'] = df['W']+df['Y'] # element-wise sum of two existing columns
df

Unnamed: 0,W,X,Y,Z,New,sum
A,2.70685,0.628133,0.907969,0.503826,!,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,@,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,#,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,$,-0.744542
E,0.190794,1.978757,2.605967,0.683509,%,2.796762


## The pandas.DataFrame.drop()
    method is used to remove rows or columns from a DataFrame. You can use it to drop specific rows or columns based on their labels. Its signature and the most important parameters are:

    DataFrame.drop(labels=None, axis=0, index=None, columns=None, inplace=False, errors='raise')
    axis=0 to drop the row
    axis=1 to drop the column
    inplace=True  [will drop the row/column from the original dataframe]


In [20]:
# Remove a column
print(df)
df.drop('sum',axis=1) # axis=1 targets columns; the returned DataFrame is not assigned, so it is discarded
print(df) # drop() does not change the original DataFrame -> df still has 'sum'; assign the result (or use inplace=True) to keep the change

          W         X         Y         Z New       sum
A  2.706850  0.628133  0.907969  0.503826   !  3.614819
B  0.651118 -0.319318 -0.848077  0.605965   @ -0.196959
C -2.018168  0.740122  0.528813 -0.589001   # -1.489355
D  0.188695 -0.758872 -0.933237  0.955057   $ -0.744542
E  0.190794  1.978757  2.605967  0.683509   %  2.796762
          W         X         Y         Z New       sum
A  2.706850  0.628133  0.907969  0.503826   !  3.614819
B  0.651118 -0.319318 -0.848077  0.605965   @ -0.196959
C -2.018168  0.740122  0.528813 -0.589001   # -1.489355
D  0.188695 -0.758872 -0.933237  0.955057   $ -0.744542
E  0.190794  1.978757  2.605967  0.683509   %  2.796762


In [21]:
# Assign the result of drop() back to df so the 'sum' column is really gone
df= df.drop('sum',axis=1)
print(df)

          W         X         Y         Z New
A  2.706850  0.628133  0.907969  0.503826   !
B  0.651118 -0.319318 -0.848077  0.605965   @
C -2.018168  0.740122  0.528813 -0.589001   #
D  0.188695 -0.758872 -0.933237  0.955057   $
E  0.190794  1.978757  2.605967  0.683509   %


In [22]:
df.drop('New',axis=1,inplace=True) # inplace=True modifies df itself instead of returning a new DataFrame
print(df)

          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509


In [23]:
df.drop('D',axis=0) # axis=0 drops a row by label; the result is only displayed, not assigned, so df itself is unchanged

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [24]:
df.loc['A'] # select a row by its index label; the row comes back as a Series

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [25]:
df.iloc[0] # select a row by integer position: 0 = first row (in our case the first row is A)

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [26]:
# A list of labels selects several rows and returns a DataFrame
df.loc[['A','C']]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001


In [27]:
# Show the full DataFrame for reference
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [28]:
# Select rows and columns in a single .loc call: df.loc[row_labels, column_labels].
# One indexer avoids chained indexing (df.loc[rows][cols]), which first builds an
# intermediate DataFrame and is unreliable when the expression is used for assignment.
df.loc[['A','C'], ['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
C,-2.018168,0.528813


In [29]:
# Scalar lookup with one indexer: df.loc[row_label, column_label].
# This replaces the chained form df.loc['A']['Y'], which first materialises the whole row.
df.loc['A', 'Y']

0.9079694464765431

## Conditional Selection

In [30]:
# Show df again as the starting point for conditional selection
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [31]:
# Comparing a DataFrame with a scalar gives a boolean DataFrame of the same shape
df >0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [32]:
temp=df.copy() # work on a copy; otherwise temp and df would refer to the same object and df would be modified too
temp[df>0]=1 # replace all numbers greater than 0 with 1
temp[df<=0]=-1 # replace all numbers less than or equal to 0 with -1
temp

Unnamed: 0,W,X,Y,Z
A,1.0,1.0,1.0,1.0
B,1.0,-1.0,-1.0,1.0
C,-1.0,1.0,1.0,-1.0
D,1.0,-1.0,-1.0,1.0
E,1.0,1.0,1.0,1.0


In [33]:
# Conditional filtering also works on a single column: the result is a boolean Series
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [34]:
# Use the boolean Series as a mask: keeps only the rows where W > 0
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [35]:
# Show the full DataFrame again before combining conditions
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [36]:
# Combining conditions
cond1=df['W']>0
cond2=df['Y']>1
df [cond1 & cond2] # element-wise AND is &, element-wise OR is | (pipe operator)

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


## To reset index
The pandas.DataFrame.reset_index() method is used to reset the index of a DataFrame. This method can be useful when you have modified the DataFrame and want to reset the index back to its default integer index. It can also be used to turn the existing index into a column
```python
DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
```

In [37]:
# Current DataFrame, with the labels A-E as its index
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
# Move the index labels into a regular column named 'index' and restore the default 0..n-1 index.
# The result is only displayed, not assigned, so df keeps its A-E index.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


## Set a column to be the index

In [39]:
# Add a column whose values will be used as the new index
df['state']="4 5 6 7 9".split()
df

Unnamed: 0,W,X,Y,Z,state
A,2.70685,0.628133,0.907969,0.503826,4
B,0.651118,-0.319318,-0.848077,0.605965,5
C,-2.018168,0.740122,0.528813,-0.589001,6
D,0.188695,-0.758872,-0.933237,0.955057,7
E,0.190794,1.978757,2.605967,0.683509,9


In [40]:
temp=df.copy()
temp.set_index('state') # returns a new DataFrame; temp itself is not modified because the result is not assigned
#Note: set_index replaces the existing index, so the original index labels are lost
#      to keep them, call reset_index first (turning them into a column) and then set_index

Unnamed: 0_level_0,W,X,Y,Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,2.70685,0.628133,0.907969,0.503826
5,0.651118,-0.319318,-0.848077,0.605965
6,-2.018168,0.740122,0.528813,-0.589001
7,0.188695,-0.758872,-0.933237,0.955057
9,0.190794,1.978757,2.605967,0.683509


In [41]:
# reset_index -> set_index
print(temp)
# reset_index() turns the current index (A-E) into a regular column named 'index';
# the result is assigned back instead of mutating the frame with inplace=True
temp = temp.reset_index()
print(temp)

# set_index('state') now replaces only the default integer index,
# so the original labels are not lost: they survive in the 'index' column
temp = temp.set_index('state')
print(temp)

          W         X         Y         Z state
A  2.706850  0.628133  0.907969  0.503826     4
B  0.651118 -0.319318 -0.848077  0.605965     5
C -2.018168  0.740122  0.528813 -0.589001     6
D  0.188695 -0.758872 -0.933237  0.955057     7
E  0.190794  1.978757  2.605967  0.683509     9
  index         W         X         Y         Z state
0     A  2.706850  0.628133  0.907969  0.503826     4
1     B  0.651118 -0.319318 -0.848077  0.605965     5
2     C -2.018168  0.740122  0.528813 -0.589001     6
3     D  0.188695 -0.758872 -0.933237  0.955057     7
4     E  0.190794  1.978757  2.605967  0.683509     9
      index         W         X         Y         Z
state                                              
4         A  2.706850  0.628133  0.907969  0.503826
5         B  0.651118 -0.319318 -0.848077  0.605965
6         C -2.018168  0.740122  0.528813 -0.589001
7         D  0.188695 -0.758872 -0.933237  0.955057
9         E  0.190794  1.978757  2.605967  0.683509


In [42]:
# Summary of the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
 4   state   5 non-null      object 
dtypes: float64(4), object(1)
memory usage: 412.0+ bytes


In [43]:
df.dtypes # data type of each column

W        float64
X        float64
Y        float64
Z        float64
state     object
dtype: object

In [44]:
# Summary statistics; only the numeric columns W-Z are included, the object column 'state' is left out
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057
