In [1]:
import pandas as pd

## **Series** (1D Data)

In [2]:
# series for 1 dimensional data
s = pd.Series([354, 567, 56, 342, 4, 68423])
s

0      354
1      567
2       56
3      342
4        4
5    68423
dtype: int64

In [3]:
print(s.index)
print(s.values)
print(s.dtype)

RangeIndex(start=0, stop=6, step=1)
[  354   567    56   342     4 68423]
int64


In [4]:
s1 = pd.Series([3, 4, 32, 42, 344, 242, 5], index=["a", "b", "c", "d", "e", "f", "g"])
# custom indexes are called labels
s1

a      3
b      4
c     32
d     42
e    344
f    242
g      5
dtype: int64

In [5]:
# slicing
print(s[2], "\n")
print(s[3:5], "\n")
print(s1["e"])
print(s1["d":"f"])  # last element is included when we access element using label

56 

3    342
4      4
dtype: int64 

344
d     42
e    344
f    242
dtype: int64


In [6]:
# returns boolean value
s1 > 50

a    False
b    False
c    False
d    False
e     True
f     True
g    False
dtype: bool

In [7]:
print(s1.shape)
print(s1.size)

(7,)
7


In [8]:
# checking type of data
type(s1)

pandas.core.series.Series

## **Creating Dataframe** (2D data)

In [9]:
import pandas as pd

result = {
    "roll": [11, 22, 33, 44, 55, 66, 77],
    "name": ["Ali", "Sameer", "Subhan", "Faisal", "Asad", "Umair", "Zohaib"],
    "python": [55, 76, 46, 99, 88, 12, 67],
    "numpy": [25, 65, 47, 79, 80, 32, 17],
}
dataframe = pd.DataFrame(result)
print(dataframe)

   roll    name  python  numpy
0    11     Ali      55     25
1    22  Sameer      76     65
2    33  Subhan      46     47
3    44  Faisal      99     79
4    55    Asad      88     80
5    66   Umair      12     32
6    77  Zohaib      67     17


### **Reading csv and excel files**


In [10]:
df1 = pd.read_csv("files/Salary_dataset.csv")
df1

Unnamed: 0.1,Unnamed: 0,YearsExperience,Salary
0,0,1.2,39344.0
1,1,1.4,46206.0
2,2,1.6,37732.0
3,3,2.1,43526.0
4,4,2.3,39892.0
5,5,3.0,56643.0
6,6,3.1,60151.0
7,7,3.3,54446.0
8,8,3.3,64446.0
9,9,3.8,57190.0


In [11]:
# Displays a summary of the DataFrame. Shows column names, non-null values, and data types.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       30 non-null     int64  
 1   YearsExperience  30 non-null     float64
 2   Salary           30 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 852.0 bytes


In [12]:
# The describe() function is used to generate summary statistics of a Pandas DataFrame or Series.
df1.describe()

Unnamed: 0.1,Unnamed: 0,YearsExperience,Salary
count,30.0,30.0,30.0
mean,14.5,5.413333,76004.0
std,8.803408,2.837888,27414.429785
min,0.0,1.2,37732.0
25%,7.25,3.3,56721.75
50%,14.5,4.8,65238.0
75%,21.75,7.8,100545.75
max,29.0,10.6,122392.0


In [13]:
df2 = pd.read_excel("files/employee.xlsx")
print(df2.tail()) # for last five rows or last n rows
type(df2)

    Serial  Emp_ID Designation  Department   Age    Salary
13      14    1098     Officer          IT  25.0  100000.0
14      15    2912     Officer          IT  31.0   90000.0
15      16    1222     Officer         NaN  25.0       NaN
16      17    2156    Engineer  Production  45.0   89000.0
17      18    2123  Office Boy    Accounts  27.0   45000.0


pandas.core.frame.DataFrame

## **Uni-variate Data Analysis**
- Focusing on a single variable at a time to describe its characteristics and patterns. It doesn't explore relationships between variables, but rather summarizes the distribution of a single variable using measures like central tendency (mean, median, mode),

In [None]:
# reading single column (series)
designation = df2["Designation"]
print(designation.head())
print(type(designation))

0       Manager
1       Officer
2       Officer
3       Manager
4    Office Boy
Name: Designation, dtype: object
<class 'pandas.core.series.Series'>


In [None]:
# accessing multiple columns (dataframe)
subset = df2[["Emp_ID", "Department"]]
subset.tail(10)

Unnamed: 0,Emp_ID,Department
8,1234,IT
9,2156,Production
10,6543,Production
11,3214,Account
12,2156,Production
13,1098,IT
14,2912,IT
15,1222,
16,2156,Production
17,2123,Accounts


In [16]:
employee_data = pd.read_excel("files/employee.xlsx")
print(employee_data)

    Serial  Emp_ID Designation  Department   Age    Salary
0        1    1101     Manager    Accounts  50.0  200000.0
1        2    1107     Officer          IT  30.0   80000.0
2        3    1203     Officer          HR  28.0       NaN
3        4    1005     Manager          HR  45.0  120000.0
4        5    2123  Office Boy    Accounts  27.0   45000.0
5        6    2451  Accountant         NaN  34.0  100000.0
6        7    1111  Accountant    Accounts   NaN  110000.0
7        8    1001     Officer          IT  25.0   75000.0
8        9    1234     Manager          IT  23.0       NaN
9       10    2156    Engineer  Production  45.0   89000.0
10      11    6543     Officer  Production  29.0  100000.0
11      12    3214  Accountant     Account  49.0  123000.0
12      13    2156    Engineer  Production  45.0   89000.0
13      14    1098     Officer          IT  25.0  100000.0
14      15    2912     Officer          IT  31.0   90000.0
15      16    1222     Officer         NaN  25.0       N

In [17]:
# axis is reverse for some methods like axis0=row and axis1=column
# drop method hides column temporarily (if inplace = False), by default, inplace = False
employee_data.drop("Department", axis=1)

Unnamed: 0,Serial,Emp_ID,Designation,Age,Salary
0,1,1101,Manager,50.0,200000.0
1,2,1107,Officer,30.0,80000.0
2,3,1203,Officer,28.0,
3,4,1005,Manager,45.0,120000.0
4,5,2123,Office Boy,27.0,45000.0
5,6,2451,Accountant,34.0,100000.0
6,7,1111,Accountant,,110000.0
7,8,1001,Officer,25.0,75000.0
8,9,1234,Manager,23.0,
9,10,2156,Engineer,45.0,89000.0


In [18]:
employee_data.head()  # department is still available because drop method hides it temporarily and returns new datafram without that column

Unnamed: 0,Serial,Emp_ID,Designation,Department,Age,Salary
0,1,1101,Manager,Accounts,50.0,200000.0
1,2,1107,Officer,IT,30.0,80000.0
2,3,1203,Officer,HR,28.0,
3,4,1005,Manager,HR,45.0,120000.0
4,5,2123,Office Boy,Accounts,27.0,45000.0


In [19]:
del employee_data["Department"]  # deletes column permanently, it is only used for columns
employee_data.head()

Unnamed: 0,Serial,Emp_ID,Designation,Age,Salary
0,1,1101,Manager,50.0,200000.0
1,2,1107,Officer,30.0,80000.0
2,3,1203,Officer,28.0,
3,4,1005,Manager,45.0,120000.0
4,5,2123,Office Boy,27.0,45000.0


In [20]:
# top 5 rows or df.head(n) top n rows 
employee_data.head()

Unnamed: 0,Serial,Emp_ID,Designation,Age,Salary
0,1,1101,Manager,50.0,200000.0
1,2,1107,Officer,30.0,80000.0
2,3,1203,Officer,28.0,
3,4,1005,Manager,45.0,120000.0
4,5,2123,Office Boy,27.0,45000.0


In [21]:
# last 5 rows or df.head(n) last n rows 
employee_data.tail()

Unnamed: 0,Serial,Emp_ID,Designation,Age,Salary
13,14,1098,Officer,25.0,100000.0
14,15,2912,Officer,31.0,90000.0
15,16,1222,Officer,25.0,
16,17,2156,Engineer,45.0,89000.0
17,18,2123,Office Boy,27.0,45000.0


In [22]:
# Returns a tuple with (number of rows, number of columns).
employee_data.shape 
# (rows, columns)

(18, 5)

In [23]:
# identify names of columns
employee_data.columns

Index(['Serial', 'Emp_ID', 'Designation', 'Age', 'Salary'], dtype='object')

In [24]:
employee_data.drop("Serial", axis=1, inplace=True)
employee_data.head()

Unnamed: 0,Emp_ID,Designation,Age,Salary
0,1101,Manager,50.0,200000.0
1,1107,Officer,30.0,80000.0
2,1203,Officer,28.0,
3,1005,Manager,45.0,120000.0
4,2123,Office Boy,27.0,45000.0


In [25]:
employee_data.rename(columns={"Emp_ID":"emp_id"}, inplace=True)
employee_data.head()

Unnamed: 0,emp_id,Designation,Age,Salary
0,1101,Manager,50.0,200000.0
1,1107,Officer,30.0,80000.0
2,1203,Officer,28.0,
3,1005,Manager,45.0,120000.0
4,2123,Office Boy,27.0,45000.0


## slicing


In [26]:
# iloc is used to slice data bases of indexes
employee_data.iloc[5:13, 0:6]
# 1st index -> row
# 2nd index -> column
# ending row is excluded

Unnamed: 0,emp_id,Designation,Age,Salary
5,2451,Accountant,34.0,100000.0
6,1111,Accountant,,110000.0
7,1001,Officer,25.0,75000.0
8,1234,Manager,23.0,
9,2156,Engineer,45.0,89000.0
10,6543,Officer,29.0,100000.0
11,3214,Accountant,49.0,123000.0
12,2156,Engineer,45.0,89000.0


In [27]:
# loc is used to slice data bases of labels
employee_data.loc[5:13,"Designation":"Salary"]
# ending row is inclusive in loc method

Unnamed: 0,Designation,Age,Salary
5,Accountant,34.0,100000.0
6,Accountant,,110000.0
7,Officer,25.0,75000.0
8,Manager,23.0,
9,Engineer,45.0,89000.0
10,Officer,29.0,100000.0
11,Accountant,49.0,123000.0
12,Engineer,45.0,89000.0
13,Officer,25.0,100000.0


In [28]:
# Returns 5 random rows from the dataset.
employee_data.sample(5)  # based on the length of sample you want. eg(data.sample(2))

Unnamed: 0,emp_id,Designation,Age,Salary
3,1005,Manager,45.0,120000.0
4,2123,Office Boy,27.0,45000.0
5,2451,Accountant,34.0,100000.0
15,1222,Officer,25.0,
1,1107,Officer,30.0,80000.0


In [29]:
# Provides statistical summary for numerical columns. Includes count, mean, std, min, max, and quartiles.
employee_data.describe()

Unnamed: 0,emp_id,Age,Salary
count,18.0,17.0,15.0
mean,1995.333333,34.294118,97000.0
std,1334.508369,9.719598,36284.589409
min,1001.0,23.0,45000.0
25%,1108.0,27.0,84500.0
50%,1678.5,30.0,90000.0
75%,2156.0,45.0,105000.0
max,6543.0,50.0,200000.0


In [30]:
# Returns a list of column names in the DataFrame.
employee_data.columns

Index(['emp_id', 'Designation', 'Age', 'Salary'], dtype='object')

In [31]:
# Returns the index (row labels) of the DataFrame.
employee_data.index

RangeIndex(start=0, stop=18, step=1)

In [32]:
# Returns the memory usage of each column in bytes.
employee_data.memory_usage()

Index          132
emp_id         144
Designation    144
Age            144
Salary         144
dtype: int64

In [None]:
# Returns a DataFrame of the same shape with True where values are missing (NaN), otherwise False.
employee_data.isnull()

Unnamed: 0,emp_id,Designation,Age,Salary
0,False,False,False,False
1,False,False,False,False
2,False,False,False,True
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,True,False
7,False,False,False,False
8,False,False,False,True
9,False,False,False,False


In [34]:
# Opposite of isnull(); returns True where values are not missing.
employee_data.notnull()

Unnamed: 0,emp_id,Designation,Age,Salary
0,True,True,True,True
1,True,True,True,True
2,True,True,True,False
3,True,True,True,True
4,True,True,True,True
5,True,True,True,True
6,True,True,False,True
7,True,True,True,True
8,True,True,True,False
9,True,True,True,True


In [35]:
# Returns the count of missing (null) values in each column.
employee_data.isnull().sum()

emp_id         0
Designation    0
Age            1
Salary         3
dtype: int64

In [36]:
# Returns the count of non-missing (valid) values in each column.
employee_data.notnull().sum()

emp_id         18
Designation    18
Age            17
Salary         15
dtype: int64

In [37]:
# accessing the 'Designation' column from the DataFrame.
employee_data.Designation

0        Manager
1        Officer
2        Officer
3        Manager
4     Office Boy
5     Accountant
6     Accountant
7        Officer
8        Manager
9       Engineer
10       Officer
11    Accountant
12      Engineer
13       Officer
14       Officer
15       Officer
16      Engineer
17    Office Boy
Name: Designation, dtype: object

In [47]:
# sum of null values in each column
employee_data.isna().sum()

emp_id         0
Designation    0
Age            1
Salary         3
dtype: int64

In [None]:
# filling null values
employee_data.fillna(
    {
        "Age": employee_data.Age.mean(),
        "Salary": employee_data.Salary.median(),
    },
    inplace=True,
)  # inplace true updates data permanently
employee_data.head()

Unnamed: 0,emp_id,Designation,Age,Salary
0,1101,Manager,50.0,200000.0
1,1107,Officer,30.0,80000.0
2,1203,Officer,28.0,90000.0
3,1005,Manager,45.0,120000.0
4,2123,Office Boy,27.0,45000.0


In [49]:
employee_data.isna().sum()

emp_id         0
Designation    0
Age            0
Salary         0
dtype: int64

In [50]:
# finding percentile
import numpy as np

np.percentile(employee_data.Age, 50)  # 50th percentile

np.float64(30.5)

In [51]:
# finding percentile using pandas
employee_data.Age.quantile(0.25)

np.float64(27.0)