Q1. List any five functions of the pandas library with execution.

In [1]:
import pandas as pd

# 1. Creating a DataFrame from a dict and saving it to a CSV file
data = {
    'Name': ['Alice', 'Bob', 'Claire'],
    'Age': [25, 30, 27],
    'Gender': ['Female', 'Male', 'Female']
}
df = pd.DataFrame(data)

# index=False keeps the row labels out of the written file
df.to_csv('data.csv', index=False)

# 2. Displaying the summary of the DataFrame
print("DataFrame Info:")
# info() prints its report itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
df.info()

# 3. Grouping the data by 'Name' and calculating the average score for each person
data_scores = {
    'Name': ['Alice', 'Bob', 'Claire', 'Bob', 'Alice'],
    'Score': [85, 92, 78, 88, 90]
}
df_scores = pd.DataFrame(data_scores)
# Selecting 'Score' before mean() makes the result a Series indexed by 'Name'
grouped_df = df_scores.groupby('Name')['Score'].mean()

print("\nGrouped DataFrame:")
print(grouped_df)

# 4. Removing the 'Gender' column from the DataFrame
# drop() returns a new frame; the name df is rebound to it
df = df.drop(columns='Gender')
print("\nDataFrame after dropping 'Gender' column:")
print(df)

# 5. Merging two DataFrames based on the 'ID' column
data1 = {'ID': [1, 2, 3], 'Subject': ['Math', 'Science', 'English']}
data2 = {'ID': [2, 3, 4], 'Score': [85, 92, 78]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# how='inner' keeps only the IDs present in both frames (2 and 3 here)
merged_df = pd.merge(df1, df2, on='ID', how='inner')
print("\nMerged DataFrame:")
print(merged_df)


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   Gender  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None

Grouped DataFrame:
Name
Alice     87.5
Bob       90.0
Claire    78.0
Name: Score, dtype: float64

DataFrame after dropping 'Gender' column:
     Name  Age
0   Alice   25
1     Bob   30
2  Claire   27

Merged DataFrame:
   ID  Subject  Score
0   2  Science     85
1   3  English     92


Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [2]:
import pandas as pd

def reindex_with_custom_increment(df):
    """Return a copy of ``df`` whose row labels are 1, 3, 5, ... (one label per row).

    The DataFrame passed in is left unchanged; only the returned copy
    carries the new index.
    """
    row_count = len(df)
    # Start at 1 and step by 2; the stop bound 2*row_count + 1 yields exactly row_count labels.
    odd_labels = pd.RangeIndex(start=1, stop=2 * row_count + 1, step=2)

    result = df.copy()
    result.index = odd_labels
    return result

# Four-row sample frame with columns 'A', 'B', and 'C'
data = dict(
    A=[10, 20, 30, 40],
    B=[100, 200, 300, 400],
    C=[1000, 2000, 3000, 4000],
)
df = pd.DataFrame(data)

# Apply the re-indexing helper to the sample frame
new_df = reindex_with_custom_increment(df)

# Show the re-indexed result
print(new_df)


    A    B     C
1  10  100  1000
3  20  200  2000
5  30  300  3000
7  40  400  4000


Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
function should print the sum to the console.

In [3]:
import pandas as pd

def calculate_sum_of_first_three(df):
    """Print the sum of the first three entries of the 'Values' column.

    Nothing is returned; the total is only written to standard output.
    If the column has fewer than three rows, the rows that exist are summed.
    """
    # head(3) takes the leading rows by position, whatever the index labels are
    leading_values = df['Values'].head(3)
    first_three_sum = leading_values.sum()

    print("Sum of the first three values in the 'Values' column:", first_three_sum)

# Five-row sample frame with a single 'Values' column
data = dict(Values=[10, 20, 30, 40, 50])
df = pd.DataFrame(data)

# The helper prints the total itself, so its return value is not used
calculate_sum_of_first_three(df)


Sum of the first three values in the 'Values' column: 60


Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
'Word_Count' that contains the number of words in each row of the 'Text' column.

In [4]:
import pandas as pd

def count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()  # no separator given: splits on any run of whitespace
    return len(words)

def add_word_count_column(df):
    """Add a 'Word_Count' column with the number of words in each 'Text' value.

    Words are whitespace-separated tokens. The count uses the vectorized
    ``.str`` accessor, so a missing 'Text' value (None/NaN) yields a missing
    count instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Text'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'Word_Count' column.
    """
    # split() with no pattern splits on runs of whitespace; str.len() then counts the tokens
    df['Word_Count'] = df['Text'].str.split().str.len()
    return df

# Three short sentences to count words in
data = dict(Text=["This is a sample text.", "Count the words.", "Pandas is great!"])
df = pd.DataFrame(data)

# Attach the 'Word_Count' column
df = add_word_count_column(df)

# Show the frame together with the new column
print(df)


                     Text  Word_Count
0  This is a sample text.           5
1        Count the words.           3
2        Pandas is great!           3


Q5. How are DataFrame.size and DataFrame.shape different?

In [6]:
import pandas as pd

# Sample DataFrame: three rows by three columns
data = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
df = pd.DataFrame(data)

# size is an attribute (read without parentheses): the total number of cells
print("DataFrame.size():", df.size)

# shape is also an attribute: a (rows, columns) tuple
print("DataFrame.shape():", df.shape)


DataFrame.size(): 9
DataFrame.shape(): (3, 3)


Q6. Which pandas function do we use to read an Excel file?

In [None]:
import pandas as pd

# pd.read_excel loads a workbook into a DataFrame; the file is looked up
# relative to the current working directory.
excel_path = 'Sample-Superstore.xlsx'
df = pd.read_excel(excel_path)


Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
addresses in the format 'username@domain.com'. Write a Python function that creates a new column
'Username' in df that contains only the username part of each email address.

In [11]:
import pandas as pd

def extract_username(email):
    """Return the part of ``email`` that precedes the first '@'.

    A string without any '@' is returned unchanged.
    """
    username, _, _ = email.partition('@')
    return username

def add_username_column(df):
    """Add a 'Username' column with the text before the first '@' of each 'Email'.

    The extraction uses the vectorized ``.str`` accessor, so a missing
    'Email' value (None/NaN) yields a missing username instead of raising
    ``AttributeError``. An address without '@' is copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Email'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'Username' column.
    """
    # n=1 stops after the first '@'; .str[0] keeps the piece in front of it
    df['Username'] = df['Email'].str.split('@', n=1).str[0]
    return df

# Three sample addresses in the 'Email' column
data = dict(Email=['alice@example.com', 'bob@example.com', 'claire@example.com'])
df = pd.DataFrame(data)

# Attach the 'Username' column
df = add_username_column(df)

# Show the frame together with the new column
print(df)


                Email Username
0   alice@example.com    alice
1     bob@example.com      bob
2  claire@example.com   claire


Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
function should return a new DataFrame that contains only the selected rows.
For example, if df contains the following values:
A B C
0 3 5 1
1 8 2 7
2 6 9 4
3 2 3 5
4 9 1 2

In [12]:
import pandas as pd

def select_rows_condition(df):
    """Return the rows of ``df`` where 'A' is greater than 5 and 'B' is less than 10.

    Selected rows keep their original index labels.
    """
    a_above_five = df['A'] > 5
    b_below_ten = df['B'] < 10
    # Element-wise AND of the two masks, then boolean row selection
    return df.loc[a_above_five & b_below_ten]

# Sample frame with the values given in the question
data = dict(
    A=[3, 8, 6, 2, 9],
    B=[5, 2, 9, 3, 1],
    C=[1, 7, 4, 5, 2],
)
df = pd.DataFrame(data)

# Keep only the rows that satisfy the condition
selected_df = select_rows_condition(df)

# Show the selected rows
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean,
median, and standard deviation of the values in the 'Values' column.

In [14]:
import pandas as pd

def calculate_statistics(df):
    """Return ``(mean, median, standard deviation)`` of the 'Values' column.

    The standard deviation comes from ``Series.std()`` with its default arguments.
    """
    values = df['Values']
    return values.mean(), values.median(), values.std()

# Five sample observations in the 'Values' column
data = dict(Values=[19, 19, 19, 20, 20])
df = pd.DataFrame(data)

# Unpack the three statistics returned by the helper
mean, median, std = calculate_statistics(df)

# Report each statistic on its own line
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std)


Mean: 19.4
Median: 19.0
Standard Deviation: 0.5477225575051661


Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
should include the current day.

In [15]:
import pandas as pd

def calculate_moving_average(df):
    """Add a 'MovingAverage' column: the mean of 'Sales' over the current row and up to six rows before it.

    The window is counted in rows (7 rows ending at the current one); the
    'Date' column is not consulted. With ``min_periods=1`` the first rows
    average however many values are available so far. ``df`` is modified in
    place and also returned.
    """
    trailing_window = df['Sales'].rolling(window=7, min_periods=1)
    df['MovingAverage'] = trailing_window.mean()
    return df

# Ten consecutive daily sales figures starting on 2023-07-01
data = dict(
    Sales=[100, 150, 120, 180, 200, 140, 160, 130, 170, 190],
    Date=pd.date_range(start='2023-07-01', periods=10),
)
df = pd.DataFrame(data)

# Attach the 'MovingAverage' column
df = calculate_moving_average(df)

# Show the frame together with the new column
print(df)


   Sales       Date  MovingAverage
0    100 2023-07-01     100.000000
1    150 2023-07-02     125.000000
2    120 2023-07-03     123.333333
3    180 2023-07-04     137.500000
4    200 2023-07-05     150.000000
5    140 2023-07-06     148.333333
6    160 2023-07-07     150.000000
7    130 2023-07-08     154.285714
8    170 2023-07-09     157.142857
9    190 2023-07-10     167.142857


Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new
column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
Monday, Tuesday) corresponding to each date in the 'Date' column.
For example, if df contains the following values:
Date
0 2023-01-01
1 2023-01-02
2 2023-01-03
3 2023-01-04
4 2023-01-05
Your function should create the following DataFrame:

Date Weekday
0 2023-01-01 Sunday
1 2023-01-02 Monday
2 2023-01-03 Tuesday
3 2023-01-04 Wednesday
4 2023-01-05 Thursday
The function should return the modified DataFrame.

In [19]:
import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column holding the English weekday name of each 'Date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date' column of datetimes or date strings. It is
        modified in place: 'Date' is converted to datetime and 'Weekday'
        is added.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the converted 'Date' and new 'Weekday' columns.
    """
    # Convert the 'Date' column to datetime if it is not already
    df['Date'] = pd.to_datetime(df['Date'])

    # day_name() returns English names by default, whereas strftime('%A')
    # follows the process locale and could yield non-English names.
    df['Weekday'] = df['Date'].dt.day_name()

    return df

# Five consecutive dates, given as ISO-formatted strings
data = dict(Date=['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])
df = pd.DataFrame(data)

# Attach the 'Weekday' column
df = add_weekday_column(df)

# Show the frame together with the new column
print(df)



        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [20]:
import pandas as pd

def select_rows_between_dates(df):
    """Return the rows of ``df`` whose 'Date' lies between 2023-01-01 and 2023-01-31.

    Both boundary days are included in full, so a timestamp such as
    2023-01-31 15:30 is selected. (Comparing with ``<= 2023-01-31`` would
    stop at midnight and drop every later timestamp on the last day.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date' column of datetimes or date strings. As a side
        effect, 'Date' is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index labels.
    """
    # Convert the 'Date' column to datetime if it is not already
    df['Date'] = pd.to_datetime(df['Date'])

    start_date = pd.to_datetime('2023-01-01')
    # Exclusive upper bound: midnight at the start of the day after the last wanted day
    end_exclusive = pd.to_datetime('2023-01-31') + pd.Timedelta(days=1)

    in_range = (df['Date'] >= start_date) & (df['Date'] < end_exclusive)
    return df[in_range]

# Three January 2023 dates and one February date, given as strings
data = dict(Date=['2023-01-01', '2023-01-15', '2023-01-31', '2023-02-10'])
df = pd.DataFrame(data)

# Keep only the rows inside the date range
selected_df = select_rows_between_dates(df)

# Show the selected rows
print(selected_df)


        Date
0 2023-01-01
1 2023-01-15
2 2023-01-31


Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to
be imported?

In [21]:
# The two triple-quoted strings in this cell are explanatory notes, not data.
'''To use the basic functions of Pandas, the first and foremost necessary library that needs to be imported is pandas itself. Pandas is a popular and powerful library in Python that provides data manipulation and analysis tools, including data structures like Series and DataFrame.

To import Pandas, you can use the following import statement:'''

import pandas as pd

# As the last expression in the cell, the string below is what the notebook echoes as the cell's output.
'''The as pd part is optional but commonly used as a convention to create an alias for the pandas library, making it easier to reference Pandas functions and objects throughout the code using the shorter alias pd. This is the standard way to import Pandas, and you will see it commonly used in Pandas-related code examples and tutorials.'''






'The as pd part is optional but commonly used as a convention to create an alias for the pandas library, making it easier to reference Pandas functions and objects throughout the code using the shorter alias pd. This is the standard way to import Pandas, and you will see it commonly used in Pandas-related code examples and tutorials.'