In [2]:
# Q1. List any five functions of the pandas library with execution.

import pandas as pd
# Small 3x3 integer frame used to demonstrate five pandas functions.
data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
df = pd.DataFrame(data)

# 1. head(n): the first n rows (here the first two).
print("head():\n", df.head(2))
# 2. describe(): count, mean, std, min, quartiles and max per numeric column.
print("\ndescribe():\n", df.describe())
# 3. drop(): result without column 'B'; df itself still has 'B' (it is used again below).
print("\ndrop():\n", df.drop('B', axis=1))

# 4. fillna(): work on a copy so the missing value does not leak into df.
df_with_nan = df.copy()
# NOTE(review): assigning None into the integer column 'B' relies on pandas
# upcasting the column to float (the output shows 4.0 / 0.0 / 6.0); newer
# pandas versions may warn about this implicit upcast -- confirm.
df_with_nan.loc[1, 'B'] = None
print("\nfillna():\n", df_with_nan.fillna(0))

# 5. groupby(): every value in 'A' is unique here, so each group holds a
# single row and the sums equal the original 'B' and 'C' values.
grouped = df.groupby('A').sum()
print("\ngroupby():\n", grouped)


head():
    A  B  C
0  1  4  7
1  2  5  8

describe():
          A    B    C
count  3.0  3.0  3.0
mean   2.0  5.0  8.0
std    1.0  1.0  1.0
min    1.0  4.0  7.0
25%    1.5  4.5  7.5
50%    2.0  5.0  8.0
75%    2.5  5.5  8.5
max    3.0  6.0  9.0

drop():
    A  C
0  1  7
1  2  8
2  3  9

fillna():
    A    B  C
0  1  4.0  7
1  2  0.0  8
2  3  6.0  9

groupby():
    B  C
A      
1  4  7
2  5  8
3  6  9


In [13]:
# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
# DataFrame with a new index that starts from 1 and increments by 2 for each row.

import pandas as pd

def reindex_dataframe(df):
    """Return ``df`` relabelled with the odd numbers 1, 3, 5, ...

    Args:
        df: DataFrame to relabel. It is not modified; ``set_index`` is
            called without ``inplace`` and therefore returns a new frame.

    Returns:
        A DataFrame with the same rows and columns as ``df`` whose index
        starts at 1 and increases by 2 per row.
    """
    row_count = len(df)
    # range(1, 2 * n, 2) yields exactly n labels: 1, 3, ..., 2n - 1.
    odd_labels = pd.Index(range(1, row_count * 2, 2))
    return df.set_index(odd_labels)

# Three-row sample frame; it starts with the default 0, 1, 2 index.
df = pd.DataFrame({
    'A': [10, 20, 30],
    'B': [40, 50, 60],
    'C': [70, 80, 90]
})

print("Original DataFrame:\n", df)

# The re-indexed result is expected to carry the labels 1, 3, 5.
reindexed_df = reindex_dataframe(df)
print("\nRe-indexed DataFrame:\n", reindexed_df)


Original DataFrame:
     A   B   C
0  10  40  70
1  20  50  80
2  30  60  90

Re-indexed DataFrame:
     A   B   C
1  10  40  70
3  20  50  80
5  30  60  90


In [3]:
# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
# iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
# function should print the sum to the console. For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should
# calculate and print the sum of the first three values, which is 60.

import pandas as pd

def sum_first_three(df):
    """Print the sum of the first three entries of the 'Values' column.

    Args:
        df: DataFrame containing a 'Values' column. If it has fewer than
            three rows, only the rows that exist are summed.

    Returns:
        None. The result is written to standard output only.
    """
    # head(3) selects by position, so the index labels do not matter.
    total = df['Values'].head(3).sum()
    print("Sum of the first three values:", total)

# Example from the question: the first three values are 10, 20 and 30.
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
sum_first_three(df)


Sum of the first three values: 60


In [5]:
# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
# 'Word_Count' that contains the number of words in each row of the 'Text' column.

import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column holding the number of words in 'Text'.

    Words are the whitespace-separated tokens of each value. Non-string
    values are converted with ``str`` before counting. Missing values
    (None / NaN / pd.NA) count as 0 words; previously they were turned
    into the text "None" / "nan" and counted as one word.

    Args:
        df: DataFrame containing a 'Text' column. It is modified in place:
            the 'Word_Count' column is added (or overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    def _count_words(value):
        # Guard first: str(NaN) would be the one-word string "nan".
        if pd.isna(value):
            return 0
        return len(str(value).split())

    df['Word_Count'] = df['Text'].apply(_count_words)
    return df

# Sample sentences with 2, 3 and 4 words respectively.
df = pd.DataFrame({'Text': ['Hello world', 'Pandas is great', 'Data science is fun']})
updated_df = add_word_count_column(df)
print(updated_df)


                  Text  Word_Count
0          Hello world           2
1      Pandas is great           3
2  Data science is fun           4


In [None]:
# Q5. How are DataFrame.size() and DataFrame.shape() different?

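DataFrame.size: An attribute (accessed without parentheses) that gives the total number of elements in the DataFrame as a single integer, i.e. rows × columns. For example, a DataFrame with 3 rows and 4 columns has size 12.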

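DataFrame.shape: An attribute (accessed without parentheses) that gives the dimensions of the DataFrame as a tuple (rows, columns). For example, a DataFrame with 3 rows and 4 columns has shape (3, 4).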


In [None]:
# Q6. Which function of pandas do we use to read an excel file?

The function to read an Excel file is pd.read_excel().

In [6]:
# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
# addresses in the format 'username@domain.com'. Write a Python function that creates a new column
# 'Username' in df that contains only the username part of each email address.
# The username is the part of the email address that appears before the '@' symbol. For example, if the
# email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your
# function should extract the username from each email address and store it in the new 'Username' column.

import pandas as pd

def extract_username(df):
    """Add a 'Username' column with the part of 'Email' before the '@'.

    Uses the vectorised ``.str`` accessor, so a missing email (None / NaN)
    yields a missing username instead of raising ``AttributeError`` as the
    previous per-row ``x.split`` call did. A value without any '@' is
    copied unchanged.

    Args:
        df: DataFrame containing an 'Email' column of strings. It is
            modified in place: the 'Username' column is added (or
            overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    # Only the text before the first '@' is needed, so split at most once.
    df['Username'] = df['Email'].str.split('@', n=1).str[0]
    return df

# Three sample addresses; the expected usernames are the parts before '@'.
df = pd.DataFrame({'Email': ['john.doe@example.com', 'jane.smith@test.com', 'bob.jones@company.org']})
updated_df = extract_username(df)
print(updated_df)


                   Email    Username
0   john.doe@example.com    john.doe
1    jane.smith@test.com  jane.smith
2  bob.jones@company.org   bob.jones


In [7]:
# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
# all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
# function should return a new DataFrame that contains only the selected rows.

import pandas as pd

def filter_rows(df):
    """Select the rows where 'A' is greater than 5 and 'B' is less than 10.

    Args:
        df: DataFrame containing numeric columns 'A' and 'B'.

    Returns:
        A DataFrame with only the matching rows. The original index
        labels of those rows are kept.
    """
    a_above_five = df['A'].gt(5)
    b_below_ten = df['B'].lt(10)
    # Both conditions must hold for a row to be kept.
    return df.loc[a_above_five & b_below_ten]

# Sample data: rows 0 and 3 have A <= 5 and are expected to be filtered out.
df = pd.DataFrame({
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2]
})

filtered_df = filter_rows(df)
print(filtered_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [8]:
# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

import pandas as pd

def calculate_statistics(df):
    """Print the mean, median and standard deviation of the 'Values' column.

    The standard deviation comes from ``Series.std`` with its default
    arguments.

    Args:
        df: DataFrame containing a numeric 'Values' column.

    Returns:
        None. The three statistics are written to standard output, one
        per line.
    """
    values = df['Values']
    # Compute everything first, then report in a fixed order.
    summary = (
        ("Mean:", values.mean()),
        ("Median:", values.median()),
        ("Standard Deviation:", values.std()),
    )
    for label, statistic in summary:
        print(label, statistic)

# Evenly spaced sample values, so the mean and the median are both 30.
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
calculate_statistics(df)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


In [9]:
# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
# create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
# for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
# should include the current day.

import pandas as pd

def calculate_moving_average(df, window=7):
    """Return ``df`` sorted by date with a trailing moving average of 'Sales'.

    The average covers the current row and up to ``window - 1`` preceding
    rows in date order. Early rows average whatever history is available
    (``min_periods=1``), so no row is NaN just because it is near the
    start. The window counts rows, not calendar days; with one row per
    day this equals the past ``window`` days including the current one.

    The input frame is left untouched. Previously the caller's 'Date'
    column was converted to datetime in place as a side effect, even
    though the result was returned as a separate, sorted frame.

    Args:
        df: DataFrame with a 'Sales' column and a 'Date' column that
            ``pd.to_datetime`` can parse.
        window: Number of rows in the rolling window. Defaults to 7.

    Returns:
        A new DataFrame sorted by 'Date', with 'Date' converted to
        datetime and an added 'MovingAverage' column.
    """
    # assign() builds a new frame, so the caller's 'Date' column keeps its dtype.
    result = df.assign(Date=pd.to_datetime(df['Date'])).sort_values('Date')
    result['MovingAverage'] = (
        result['Sales'].rolling(window=window, min_periods=1).mean()
    )
    return result

# Ten consecutive daily sales figures (2024-01-01 .. 2024-01-10), so the
# 7-row window is only full from the seventh row onwards.
df = pd.DataFrame({
    'Sales': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'Date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05', 
             '2024-01-06', '2024-01-07', '2024-01-08', '2024-01-09', '2024-01-10']
})

df_with_moving_average = calculate_moving_average(df)
print(df_with_moving_average)

   Sales       Date  MovingAverage
0     10 2024-01-01           10.0
1     20 2024-01-02           15.0
2     30 2024-01-03           20.0
3     40 2024-01-04           25.0
4     50 2024-01-05           30.0
5     60 2024-01-06           35.0
6     70 2024-01-07           40.0
7     80 2024-01-08           50.0
8     90 2024-01-09           60.0
9    100 2024-01-10           70.0


In [10]:
# Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new
# column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
# Monday, Tuesday) corresponding to each date in the 'Date' column.

import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column with the English day name of each 'Date'.

    Args:
        df: DataFrame containing a 'Date' column that ``pd.to_datetime``
            can parse. It is modified in place: 'Date' is replaced by its
            datetime version and 'Weekday' is added (or overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    # day_name() gives full names such as 'Monday' or 'Tuesday'.
    df['Weekday'] = parsed_dates.dt.day_name()
    return df

# Five consecutive dates given as strings; the output shows they run
# from Monday to Friday.
df = pd.DataFrame({
    'Date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']
})

df_with_weekday = add_weekday_column(df)
print(df_with_weekday)


        Date    Weekday
0 2024-01-01     Monday
1 2024-01-02    Tuesday
2 2024-01-03  Wednesday
3 2024-01-04   Thursday
4 2024-01-05     Friday


In [11]:
# Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
# function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

import pandas as pd

def select_date_range(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Select the rows whose 'Date' falls between two calendar days, inclusive.

    Both bounds are treated as whole days: a row matches when its
    timestamp is at or after 00:00 on ``start_date`` and before 00:00 on
    the day after ``end_date``. Previously the upper bound was midnight
    at the start of the end date, so timestamps later on that day (for
    example '2023-01-31 12:00:00') were wrongly dropped.

    The input frame is left untouched. Previously the caller's 'Date'
    column was converted to datetime in place as a side effect.

    Args:
        df: DataFrame containing a 'Date' column that ``pd.to_datetime``
            can parse.
        start_date: First day of the range. Defaults to '2023-01-01'.
        end_date: Last day of the range. Defaults to '2023-01-31'.

    Returns:
        A new DataFrame with only the matching rows, 'Date' converted to
        datetime, and the original index labels kept.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    range_start = pd.to_datetime(start_date).normalize()
    # Half-open upper bound: everything strictly before the following midnight.
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)
    in_range = (parsed_dates >= range_start) & (parsed_dates < range_end)
    # assign() builds a new frame, so the caller's 'Date' column keeps its dtype.
    return df.assign(Date=parsed_dates)[in_range]

# Four timestamps given as strings; the 2023-02-01 row lies outside
# January and is expected to be dropped.
df = pd.DataFrame({
    'Date': ['2023-01-01 10:00:00', '2023-01-15 12:00:00', '2023-02-01 08:00:00', '2023-01-30 16:00:00'],
    'Value': [100, 200, 300, 400]
})

filtered_df = select_date_range(df)
print(filtered_df)


                 Date  Value
0 2023-01-01 10:00:00    100
1 2023-01-15 12:00:00    200
3 2023-01-30 16:00:00    400


In [12]:
# Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

# The first and foremost necessary library that needs to be imported to use the basic functions of Pandas is:

import pandas as pd
