In [40]:
import pandas as pd
import numpy as np

# A list of numbers
nums = [20, 30, 40, 55]

# A list with mixed data types (integers and strings)
letters = ["a", "b", 33, "d"]

# A dictionary with keys and values
dictt = {"a": 33, "z": 11, "q": 49}

# Generating 9 random integers between 10 and 100 using NumPy
rannum = np.random.randint(10, 100, 9)

# Creating a Pandas Series with `nums` as data and `letters` as index
pandas_series = pd.Series(nums, letters)
print(pandas_series)

import warnings
warnings.filterwarnings("ignore")

a     20
b     30
33    40
d     55
dtype: int64


In [2]:
# Slicing the first two elements of the Pandas Series `pandas_series`
print(pandas_series[0:2])

a    20
b    30
dtype: int64


In [3]:
# Reversing the order of the elements in the Pandas Series `pandas_series`
print(pandas_series[::-1])

d     55
33    40
b     30
a     20
dtype: int64


In [4]:
# Print the number of dimensions of the Pandas Series 
# (it will be 1, because it's a 1-dimensional array)
print(pandas_series.ndim)

1


In [5]:
# Calculate and print the sum of the elements in the Series
print(pandas_series.sum())  # You can also use max(), min(), etc.

145


In [6]:
# Add the Series to itself (element-wise addition)
print(pandas_series + pandas_series)

a      40
b      60
33     80
d     110
dtype: int64


In [7]:
# Square each element of the Series
print(pandas_series ** 2)

a      400
b      900
33    1600
d     3025
dtype: int64


In [8]:
# Check if each element in the Series is greater than or equal to 44
print(pandas_series >= 44)

a     False
b     False
33    False
d      True
dtype: bool


In [9]:
# Creating a Pandas Series with scalar value 5 and custom indices [0,1,2,3]
pan_ser = pd.Series(5, [0, 1, 2, 3])
print(pan_ser)

0    5
1    5
2    5
3    5
dtype: int64


In [10]:
# Creating a Pandas Series from a dictionary (`dictt`), 
# where the keys are used as index and values as data
pan_dic = pd.Series(dictt)
print(pan_dic)

a    33
z    11
q    49
dtype: int64


In [11]:
# Creating a Pandas Series from random numbers generated by NumPy
randpan = pd.Series(rannum)
print(randpan)

0    56
1    31
2    61
3    30
4    63
5    76
6    14
7    89
8    97
dtype: int32


In [12]:
# Creating two Pandas Series representing the sales of Ford cars 
ford2023 = pd.Series([14, 23, 32, 19], ["model1", "model2", 
                                        "model2", "model3"])
ford2024 = pd.Series([24, 29, 12, 29], ["model1", "model2", 
                                        "model3", "model4"])

# Adding the two Series together 
#(it will add elements with matching index labels)
print(ford2023 + ford2024)

model1    38.0
model2    52.0
model2    61.0
model3    31.0
model4     NaN
dtype: float64


In [13]:
# Create three Pandas Series
s1 = pd.Series([21, 32, 34, 22])
s2 = pd.Series([23, 43, 22, 11])
s3 = pd.Series([13, 13, 32, 41])

# Create a dictionary where each key corresponds to a Series
data = dict(grapes=s1, toys=s2, snacks=s3)

# Create a DataFrame using the dictionary
df = pd.DataFrame(data)
print(df) 

   grapes  toys  snacks
0      21    23      13
1      32    43      13
2      34    22      32
3      22    11      41


In [14]:
# Create a simple DataFrame with a list of lists
df2 = pd.DataFrame([["Jack", 33], ["John", 23], 
                    ["Jacob", 83], ["Jerry", 93]])
print(df2)

       0   1
0   Jack  33
1   John  23
2  Jacob  83
3  Jerry  93


In [15]:
# Create a DataFrame using a list of lists, specify column names and index
dataaa = [[33, 22], [11, 44], [55, 88]]
df3 = pd.DataFrame(dataaa, columns=["Num1", "Num2"], 
                   index=[2, 3, 4], dtype=float)
print(df3)

   Num1  Num2
2  33.0  22.0
3  11.0  44.0
4  55.0  88.0


In [16]:
# Create a DataFrame using a dictionary, 
# where 'Names' and 'Grades' are the keys
dictt = {"Names": ["Jack", "John", "Jacob", "Jerry"],
         "Grades": [23, 34, 45, 55]}

In [17]:
# Create a DataFrame with a custom index using the dictionary
df4 = pd.DataFrame(dictt, index=["222", "333", "444", "555"])
print(df4)

     Names  Grades
222   Jack      23
333   John      34
444  Jacob      45
555  Jerry      55


In [18]:
from numpy.random import randn

# Create a DataFrame with random values from a normal distribution
# The DataFrame has 3 rows labeled 'A', 'B', 'C' and 3 columns 
df = pd.DataFrame(randn(3, 3), 
                  index=["A", "B", "C"],
                  columns=["Col1", "col2", "col3"])
print(df)

       Col1      col2      col3
A -0.958110 -0.501303  0.242829
B  0.849789 -0.351419 -0.315132
C  0.813042  1.976447 -0.350192


In [19]:
# Select the 'col2' column from the DataFrame
print(df["col2"])

A   -0.501303
B   -0.351419
C    1.976447
Name: col2, dtype: float64


In [20]:
# Select multiple columns ('col2' and 'col3') from the DataFrame
print(df[["col2", "col3"]])

       col2      col3
A -0.501303  0.242829
B -0.351419 -0.315132
C  1.976447 -0.350192


In [21]:
# Select all values in row 'A' using .loc (label-based indexing)
print(df.loc["A"])

Col1   -0.958110
col2   -0.501303
col3    0.242829
Name: A, dtype: float64


In [22]:
# Select a range of rows and columns using .loc (label-based indexing)
# Selecting rows 'A' to 'B' and columns 'Col1' to 'col2'
print(df.loc["A":"B", "Col1":"col2"])

       Col1      col2
A -0.958110 -0.501303
B  0.849789 -0.351419


In [23]:
# Select the second row using .iloc (integer-based indexing)
print(df.iloc[1])

Col1    0.849789
col2   -0.351419
col3   -0.315132
Name: B, dtype: float64


In [24]:
# Create a new column 'Col4' with random values and add it to the DataFrame
df["Col4"] = pd.Series(randn(3), index=["A", "B", "C"])

In [25]:
# Create another column 'Col5' that is the sum of 'Col1' and 'col3'
df["Col5"] = df["Col1"] + df["col3"]
print(df)

       Col1      col2      col3      Col4      Col5
A -0.958110 -0.501303  0.242829  1.378398 -0.715281
B  0.849789 -0.351419 -0.315132 -0.742886  0.534656
C  0.813042  1.976447 -0.350192 -0.503846  0.462850


In [26]:
# Drop the 'col2' column from the DataFrame
# axis=1 means dropping columns (axis=0 would drop rows)
# By default, the original DataFrame is not modified (inplace=False)
print(df.drop("col2", axis=1))

# To update the original DataFrame permanently, you can use inplace=True
# df.drop("col2", axis=1, inplace=True)

       Col1      col3      Col4      Col5
A -0.958110  0.242829  1.378398 -0.715281
B  0.849789 -0.315132 -0.742886  0.534656
C  0.813042 -0.350192 -0.503846  0.462850


In [27]:
# Create a dictionary to represent employees' information
employees = {
    "employees": ["john", "jake", "lindy", 
                  "charlie", "madison", "vivienne"],
    "Department": ["HR", "IT", "HR", "IT", "BT", "BT"],
    "age": [33, 22, 44, 33, 44, 44],
    "salary": [4000, 3400, 2900, 4900, 2200, 4700]
}

# Convert the dictionary into a pandas DataFrame
df = pd.DataFrame(employees)
print(df)

  employees Department  age  salary
0      john         HR   33    4000
1      jake         IT   22    3400
2     lindy         HR   44    2900
3   charlie         IT   33    4900
4   madison         BT   44    2200
5  vivienne         BT   44    4700


In [28]:
# Calculate the total salary of all employees
summ = df["salary"].sum()
print(summ)

22100


In [33]:
# Group the DataFrame by the "Departman" column
Depar = df.groupby(["Department"]).groups
print(Depar)

{'BT': [4, 5], 'HR': [0, 2], 'IT': [1, 3]}


In [34]:
# Group the DataFrame by both "Departman" and "age"
# and get the dictionary of their groups (keys are tuples)
depandage = df.groupby(["Department", "age"]).groups
print(depandage)

{('BT', 44): [4, 5], ('HR', 33): [0], ('HR', 44): [2], ('IT', 22): [1], ('IT', 33): [3]}


In [35]:
# Iterate over the grouped DataFrame by "Department"
for name, group in Depar:
    print(name)  # Print department name
    print(group)  # Print the group (employees in that department)

B
T
H
R
I
T


In [36]:
# Get and print the group of employees in the "HR" department
BT = df.groupby("Department").get_group("HR")
print(BT)

  employees Department  age  salary
0      john         HR   33    4000
2     lindy         HR   44    2900


In [41]:
# Perform various aggregation operations on grouped data by "Departman"
# 1. Sum of the grouped columns
SUM = df.groupby("Department").sum()
print(SUM)

            age  salary
Department             
BT           88    6900
HR           77    6900
IT           55    8300


In [42]:
# 2. Mean of the grouped columns
mean = df.groupby("Department").mean()
print(mean)

             age  salary
Department              
BT          44.0  3450.0
HR          38.5  3450.0
IT          27.5  4150.0


In [43]:
# 3. Sum of "Maas" for each department
meanOfsalary = df.groupby("Department")["salary"].sum()
print(meanOfsalary)

Department
BT    6900
HR    6900
IT    8300
Name: salary, dtype: int64


In [46]:
# 4. Count of employees in each "age" group
countt = df.groupby("age")["employees"].count()
print(countt)

age
22    1
33    2
44    3
Name: employees, dtype: int64


In [47]:
# 5. Maximum salary in the "BT" department
Maxx = df.groupby("Department")["salary"].max()["BT"]
print(Maxx)

4700


In [49]:
# 6. Aggregating multiple functions (mean, sum, max, min) for the "salary"
ALL = df.groupby("Department")["salary"].agg([np.mean, np.sum, 
                                              np.max, np.min])
print(ALL)

              mean   sum  amax  amin
Department                          
BT          3450.0  6900  4700  2200
HR          3450.0  6900  4000  2900
IT          4150.0  8300  4900  3400


In [50]:
# Create a random 5x3 integer array with values between 10 and 100
data = np.random.randint(10, 100, 15).reshape(5, 3)

# Create a DataFrame from 'data' with custom indices and column names
df = pd.DataFrame(data, index=["a", "c", "e", "f", "h"], 
                  columns=["Col1", "Col2", "Col3"])
print(df)

   Col1  Col2  Col3
a    36    54    66
c    77    80    41
e    62    45    32
f    37    80    61
h    50    75    85


In [51]:
# Reindex the DataFrame to include missing rows "b", "d", "g"
df = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
print(df)  # Print DataFrame after reindexing (NaN will appear for new rows

   Col1  Col2  Col3
a  36.0  54.0  66.0
b   NaN   NaN   NaN
c  77.0  80.0  41.0
d   NaN   NaN   NaN
e  62.0  45.0  32.0
f  37.0  80.0  61.0
g   NaN   NaN   NaN
h  50.0  75.0  85.0


In [52]:
# Drop the "Col1" column from the DataFrame
df1 = df.drop("Col1", axis=1)

# Drop rows "a" and "d" from the DataFrame
df2 = df.drop(["a", "d"], axis=0)

print(df1)  # Print DataFrame without "Col1"
print(df2)  # Print DataFrame without rows "a" and "d"

   Col2  Col3
a  54.0  66.0
b   NaN   NaN
c  80.0  41.0
d   NaN   NaN
e  45.0  32.0
f  80.0  61.0
g   NaN   NaN
h  75.0  85.0
   Col1  Col2  Col3
b   NaN   NaN   NaN
c  77.0  80.0  41.0
e  62.0  45.0  32.0
f  37.0  80.0  61.0
g   NaN   NaN   NaN
h  50.0  75.0  85.0


In [53]:
# Check for missing values (NaN) in the DataFrame 
Nan = df.isnull() # (returns a boolean DataFrame)
print(Nan)  # Print True for NaN values and False otherwise

    Col1   Col2   Col3
a  False  False  False
b   True   True   True
c  False  False  False
d   True   True   True
e  False  False  False
f  False  False  False
g   True   True   True
h  False  False  False


In [54]:
# Count the number of missing values in each column
numNan = df.isnull().sum()
print(numNan)  # Print the count of missing values per column

Col1    3
Col2    3
Col3    3
dtype: int64


In [55]:
# Create a new column "Col4" with some missing values (np.nan)
newcoll = [np.nan, 30, np.nan, 49, np.nan, 22, 11, np.nan]
df["Col4"] = newcoll
print(df)  # Print DataFrame with new "Col4"

   Col1  Col2  Col3  Col4
a  36.0  54.0  66.0   NaN
b   NaN   NaN   NaN  30.0
c  77.0  80.0  41.0   NaN
d   NaN   NaN   NaN  49.0
e  62.0  45.0  32.0   NaN
f  37.0  80.0  61.0  22.0
g   NaN   NaN   NaN  11.0
h  50.0  75.0  85.0   NaN


In [56]:
# Filter and show rows where "Col3" has NaN values
nulls = df[df["Col3"].isnull()]
print(nulls)  # Print rows where "Col3" is NaN

   Col1  Col2  Col3  Col4
b   NaN   NaN   NaN  30.0
d   NaN   NaN   NaN  49.0
g   NaN   NaN   NaN  11.0


In [57]:
# Drop rows with any NaN values
dropnans = df.dropna()
print(dropnans)  # Print DataFrame after dropping all rows with NaN

   Col1  Col2  Col3  Col4
f  37.0  80.0  61.0  22.0


In [58]:
# Drop rows where "Col1" and "Col4" are both NaN (how="all")
subset1 = df.dropna(subset=["Col1", "Col4"], how="all")
print(subset1)  
# Print DataFrame where rows with both NaN in "Col1" and "Col4" are dropped

   Col1  Col2  Col3  Col4
a  36.0  54.0  66.0   NaN
b   NaN   NaN   NaN  30.0
c  77.0  80.0  41.0   NaN
d   NaN   NaN   NaN  49.0
e  62.0  45.0  32.0   NaN
f  37.0  80.0  61.0  22.0
g   NaN   NaN   NaN  11.0
h  50.0  75.0  85.0   NaN


In [59]:
# Drop rows where either "Col1" or "Col4" is NaN (how="any")
subset2 = df.dropna(subset=["Col1", "Col4"], how="any")
print(subset2)  
# Print DataFrame where rows with NaN in either "Col1" or "Col4" are dropped

   Col1  Col2  Col3  Col4
f  37.0  80.0  61.0  22.0


In [60]:
# Drop rows that have fewer than 3 non-NaN values
thresh = df.dropna(thresh=3)
print(thresh)  # Print DataFrame after applying threshold

   Col1  Col2  Col3  Col4
a  36.0  54.0  66.0   NaN
c  77.0  80.0  41.0   NaN
e  62.0  45.0  32.0   NaN
f  37.0  80.0  61.0  22.0
h  50.0  75.0  85.0   NaN


In [61]:
# Fill NaN values with "No input"
fillna = df.fillna(value="No input")
print(fillna)  # Print DataFrame with NaN replaced by "No input"

       Col1      Col2      Col3      Col4
a      36.0      54.0      66.0  No input
b  No input  No input  No input      30.0
c      77.0      80.0      41.0  No input
d  No input  No input  No input      49.0
e      62.0      45.0      32.0  No input
f      37.0      80.0      61.0      22.0
g  No input  No input  No input      11.0
h      50.0      75.0      85.0  No input


In [62]:
# Fill NaN values with 0
fillna2 = df.fillna(value=0)
print(fillna2)  # Print DataFrame with NaN replaced by 0

   Col1  Col2  Col3  Col4
a  36.0  54.0  66.0   0.0
b   0.0   0.0   0.0  30.0
c  77.0  80.0  41.0   0.0
d   0.0   0.0   0.0  49.0
e  62.0  45.0  32.0   0.0
f  37.0  80.0  61.0  22.0
g   0.0   0.0   0.0  11.0
h  50.0  75.0  85.0   0.0


In [63]:
# Define a function that calculates the mean of non-NaN 
# values in the DataFrame
def mean_fill(df):
    total = df.sum().sum()  # Total sum of all elements
    size = df.size - df.isnull().sum().sum()  # Count of non-NaN elements
    return total / size  # Return the mean

# Fill NaN values with the calculated mean of the DataFrame
filled_w_mean = df.fillna(value=mean_fill(df))
print(filled_w_mean)  # Print DataFrame with NaN replaced by the mean value

        Col1       Col2       Col3       Col4
a  36.000000  54.000000  66.000000  52.263158
b  52.263158  52.263158  52.263158  30.000000
c  77.000000  80.000000  41.000000  52.263158
d  52.263158  52.263158  52.263158  49.000000
e  62.000000  45.000000  32.000000  52.263158
f  37.000000  80.000000  61.000000  22.000000
g  52.263158  52.263158  52.263158  11.000000
h  50.000000  75.000000  85.000000  52.263158


In [64]:
# Generate a 15x5 matrix of random integers between 10 and 100
data = np.random.randint(10, 100, 75).reshape(15, 5)

# Create a DataFrame from 'data' with columns "Col1", "Col2", etc.
df = pd.DataFrame(data, columns=["Col1", "Col2", "Col3", "Col4", "Col5"])

# Print the entire DataFrame
print(df)

    Col1  Col2  Col3  Col4  Col5
0     33    24    54    11    30
1     39    75    36    64    80
2     49    44    94    40    19
3     98    34    60    50    56
4     75    27    24    21    97
5     51    92    17    14    25
6     56    35    63    71    47
7     93    36    67    14    13
8     20    50    62    75    59
9     81    58    43    79    80
10    10    61    42    43    32
11    57    72    41    18    55
12    52    94    48    88    47
13    38    81    66    30    43
14    28    24    69    73    80


In [65]:
# Print the first 7 rows of the DataFrame
print(df.head(7))

   Col1  Col2  Col3  Col4  Col5
0    33    24    54    11    30
1    39    75    36    64    80
2    49    44    94    40    19
3    98    34    60    50    56
4    75    27    24    21    97
5    51    92    17    14    25
6    56    35    63    71    47


In [66]:
# Print the last 4 rows of the DataFrame
print(df.tail(4))

    Col1  Col2  Col3  Col4  Col5
11    57    72    41    18    55
12    52    94    48    88    47
13    38    81    66    30    43
14    28    24    69    73    80


In [67]:
# Print the first 3 rows of "Col2"
print(df["Col2"].head(3))

0    24
1    75
2    44
Name: Col2, dtype: int32


In [68]:
# Select rows 5 to 12 (inclusive), then select "Col1" and "Col2" 
# with last 5 rows
print(df[5:13][["Col1", "Col2"]].tail())

    Col1  Col2
8     20    50
9     81    58
10    10    61
11    57    72
12    52    94


In [69]:
# Print DataFrame where values greater than 50 are displayed, 
# others will show as NaN
print(df[df > 50])

    Col1  Col2  Col3  Col4  Col5
0    NaN   NaN  54.0   NaN   NaN
1    NaN  75.0   NaN  64.0  80.0
2    NaN   NaN  94.0   NaN   NaN
3   98.0   NaN  60.0   NaN  56.0
4   75.0   NaN   NaN   NaN  97.0
5   51.0  92.0   NaN   NaN   NaN
6   56.0   NaN  63.0  71.0   NaN
7   93.0   NaN  67.0   NaN   NaN
8    NaN   NaN  62.0  75.0  59.0
9   81.0  58.0   NaN  79.0  80.0
10   NaN  61.0   NaN   NaN   NaN
11  57.0  72.0   NaN   NaN  55.0
12  52.0  94.0   NaN  88.0   NaN
13   NaN  81.0  66.0   NaN   NaN
14   NaN   NaN  69.0  73.0  80.0


In [70]:
# Print rows where values in "Col1" are greater than 50
print(df[df["Col1"] > 50])  # !!!

    Col1  Col2  Col3  Col4  Col5
3     98    34    60    50    56
4     75    27    24    21    97
5     51    92    17    14    25
6     56    35    63    71    47
7     93    36    67    14    13
9     81    58    43    79    80
11    57    72    41    18    55
12    52    94    48    88    47


In [71]:
# Print "Col1", "Col2", and "Col3" where "Col1" is greater than 50
print(df[df["Col1"] > 50][["Col1", "Col2", "Col3"]])  # !!!

    Col1  Col2  Col3
3     98    34    60
4     75    27    24
5     51    92    17
6     56    35    63
7     93    36    67
9     81    58    43
11    57    72    41
12    52    94    48


In [72]:
# Print rows where "Col1" is greater than 50 AND "Col2" is less than 76
print(df[(df["Col1"] > 50) & (df["Col2"] < 76)])  # !!!

    Col1  Col2  Col3  Col4  Col5
3     98    34    60    50    56
4     75    27    24    21    97
6     56    35    63    71    47
7     93    36    67    14    13
9     81    58    43    79    80
11    57    72    41    18    55


In [73]:
# Print rows where "Col1" is greater than 50 OR "Col3" is an odd number
print(df[(df["Col1"] > 50) | (df["Col3"] % 2 == 1)])  # !!!

    Col1  Col2  Col3  Col4  Col5
3     98    34    60    50    56
4     75    27    24    21    97
5     51    92    17    14    25
6     56    35    63    71    47
7     93    36    67    14    13
9     81    58    43    79    80
11    57    72    41    18    55
12    52    94    48    88    47
14    28    24    69    73    80


In [74]:
#Query using a condition: "Col1" greater than 60 AND "Col3" greater than 80
#Then print "Col1", "Col2", and "Col3" for these rows
print(df.query("Col1 > 60 & Col3 > 80")[["Col1", "Col2", "Col3"]])

Empty DataFrame
Columns: [Col1, Col2, Col3]
Index: []
