In [2]:
#Q1. List any five functions of the pandas library with execution.

import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Claire', 'David', 'Eve'],
        'Age': [25, 30, 27, None, 29],
        'Gender': ['Female', 'Male', 'Female', 'Male', 'Female']}
df = pd.DataFrame(data)

#1. head(): Returns the first n rows of the DataFrame.
print(df.head(3)) # to display the first 3 rows
print()

#2. describe(): Generates descriptive statistics of the DataFrame.
print(df.describe())
print()

#3. info(): Provides a summary of the DataFrame.
print(df.info())
print()

#4. groupby(): Groups the data based on a column and applies an aggregation function.
grouped_df = df.groupby('Gender')['Age'].mean()
print()

#5. fillna(): Fills missing values with a specified value.
filled_df = df.fillna(0)
print(filled_df)

     Name   Age  Gender
0   Alice  25.0  Female
1     Bob  30.0    Male
2  Claire  27.0  Female

             Age
count   4.000000
mean   27.750000
std     2.217356
min    25.000000
25%    26.500000
50%    28.000000
75%    29.250000
max    30.000000

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Gender  5 non-null      object 
dtypes: float64(1), object(2)
memory usage: 252.0+ bytes
None


     Name   Age  Gender
0   Alice  25.0  Female
1     Bob  30.0    Male
2  Claire  27.0  Female
3   David   0.0    Male
4     Eve  29.0  Female


In [3]:
'''Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.'''

def reindex_df(df):
    new_index = range(1, 2*len(df) + 1, 2)
    df.index = new_index
    
    return df

data = {
    'A': [10, 20, 30, 40],
    'B': [15, 25, 35, 45],
    'C': [20, 30, 40, 50]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

print("\nRe-indexed DataFrame:")
print(reindex_df(df))

Original DataFrame:
    A   B   C
0  10  15  20
1  20  25  30
2  30  35  40
3  40  45  50

Re-indexed DataFrame:
    A   B   C
1  10  15  20
3  20  25  30
5  30  35  40
7  40  45  50


In [5]:
'''Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.

For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should calculate and print the sum of the first three values, which is 60.
'''

def sum_first_three_values(df):
    if 'Values' in df.columns and len(df['Values']) >= 3:
        sum_of_values = df['Values'].iloc[:3].sum()
        print("The sum of first three values is:", sum_of_values)
    else:
        print("The 'Values' column does not exist or does not have at least three values.")

data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

sum_first_three_values(df)

The sum of first three values is: 60


In [6]:
'''Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.
'''

def add_word_count_column(df):

    def word_count(text):

        if pd.isna(text):
            return 0
        
        return len(text.split())
    
    df['Word_Count'] = df['Text'].apply(word_count)

    return df

data = {
    'Text': ['Mindset is everything', 'Begin anywhere', 'Escape the ordinary', None, 'Live every moment']
}
df = pd.DataFrame(data)

df = add_word_count_column(df)
print(df)


                    Text  Word_Count
0  Mindset is everything           3
1         Begin anywhere           2
2    Escape the ordinary           3
3                   None           0
4      Live every moment           3


In [7]:
'''Q5. How are DataFrame.size() and DataFrame.shape() different?

Answer:- Both properties provide valuable information but in different contexts: 

DataFrame.size() property returns the number of elements in the DataFrame.
-> It is calculated as the product of the number of rows and the number of columns (i.e., rows * columns).
-> It returns a single integer.

DataFrame.shape() property returns a tuple representing the dimensionality of the DataFrame.
-> The tuple consists of two values: the number of rows and the number of columns (i.e., (rows, columns)).
-> It returns a tuple of integers.
'''
#Example: 

data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}
df = pd.DataFrame(data)

print("DataFrame.size:", df.size)

print("DataFrame.shape:", df.shape)

DataFrame.size: 9
DataFrame.shape: (3, 3)


In [9]:
'''Q6. Which function of pandas do we use to read an excel file?

Answer:- To read an Excel file in Pandas, we use the 'read_excel()' function. This function allows you to read data from an Excel file into a Pandas DataFrame.
'''

"Q6. Which function of pandas do we use to read an excel file?\n\nAnswer:- To read an Excel file in Pandas, we use the 'read_excel()' function. This function allows you to read data from an Excel file into a Pandas DataFrame.\n"

In [10]:
'''Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address.

The username is the part of the email address that appears before the '@' symbol. For example, if the email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your function should extract the username from each email address and store it in the new 'Username' column.
'''

def extract_username(df):

    def get_username(email):
        return email.split('@')[0]
    
    df['Username'] = df['Email'].apply(get_username)

    return df

data = {
    'Email': ['john.doe@example.com', 'clara.smith@sample.com', 'rony123@test.org']
}
df = pd.DataFrame(data)

df = extract_username(df)

print(df)


                    Email     Username
0    john.doe@example.com     john.doe
1  clara.smith@sample.com  clara.smith
2        rony123@test.org      rony123


In [11]:
'''Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The function should return a new DataFrame that contains only the selected rows.

For example, if df contains the following values:

  A B C

0 3 5 1

1 8 2 7

2 6 9 4

3 2 3 5

4 9 1 2

Your function should select the following rows: A B C

1 8 2 7

4 9 1 2

The function should return a new DataFrame that contains only the selected rows.
'''

def select_rows(df):
    filtered_df = df[(df['A'] > 5) & (df['B'] < 10)]
    return filtered_df

data = {
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2]
}

df = pd.DataFrame(data)

selected_rows = select_rows(df)
print(selected_rows)

   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [15]:
'''Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.
'''

def calculate_statistics(df):
    mean_value = df['Values'].mean()
    median_value = df['Values'].median()
    std_dev_value = df['Values'].std()
    
    return mean_value, median_value, std_dev_value

data = {
    'Values': [10, 2, 13, 4, 50, 35, 70]
}
df = pd.DataFrame(data)

mean, median, std_dev = calculate_statistics(df)

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Standard Deviation: {std_dev}")



Mean: 26.285714285714285
Median: 13.0
Standard Deviation: 26.068590843607726


In [16]:
'''Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.
'''
def add_moving_average(df):

    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('Date')
    df['MovingAverage'] = df['Sales'].rolling(window=7, min_periods = 1).mean()
    
    return df

data = {
    'Date': ['01-01-2023', '01-02-2023', '01-03-2023', '01-04-2023','01-05-2023', '01-06-2023', '01-07-2023', '01-08-2023', '01-09-2023','01-10-2023'],
    'Sales': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
}
df = pd.DataFrame(data)

df = add_moving_average(df)
print(df)

        Date  Sales  MovingAverage
0 2023-01-01    100          100.0
1 2023-01-02    150          125.0
2 2023-01-03    200          150.0
3 2023-01-04    250          175.0
4 2023-01-05    300          200.0
5 2023-01-06    350          225.0
6 2023-01-07    400          250.0
7 2023-01-08    450          300.0
8 2023-01-09    500          350.0
9 2023-01-10    550          400.0


In [17]:
'''Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.Monday, Tuesday) corresponding to each date in the 'Date' column.

For example, if df contains the following values:

    Date

0 2023-01-01

1 2023-01-02

2 2023-01-03

3 2023-01-04

4 2023-01-05

Your function should create the following DataFrame:


   Date      Weekday

0 2023-01-01 Sunday

1 2023-01-02 Monday

2 2023-01-03 Tuesday

3 2023-01-04 Wednesday

4 2023-01-05 Thursday

The function should return the modified DataFrame.'''

def add_weekday_column(df):

    df['Date'] = pd.to_datetime(df['Date'])
    df['Weekday'] = df['Date'].dt.day_name()  #to extracts the full weekday name from each date in the 'Date' column

    return df

data = {
    'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']
}
df = pd.DataFrame(data)

df = add_weekday_column(df)
print(df)

        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


In [18]:
'''Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.
'''

def filter_by_date_range(df, start_date, end_date):

    df['Date'] = pd.to_datetime(df['Date'])
    
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    filtered_df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
    
    return filtered_df

data = {
    'Date': ['2023-01-01', '2023-01-15', '2023-02-01', '2023-01-31','2023-03-01'],
    'Value': [10, 20, 30, 40, 50]
}
df = pd.DataFrame(data)

start_date = '2023-01-01'
end_date = '2023-01-31'


filtered_df = filter_by_date_range(df, start_date, end_date)
print(filtered_df)

        Date  Value
0 2023-01-01     10
1 2023-01-15     20
3 2023-01-31     40


In [19]:
'''Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

Answer:- To use the basic functions of Pandas in Python, we need to import the Pandas library. The first and foremost necessary library to import is:
'''

import pandas as pd