# Q1. List any five functions of the pandas library with execution.

In [1]:
import pandas as pd

# Build a small CSV on disk so that read_csv has something to load
data = """Name,Age,Gender
Alice,25,Female
Bob,30,Male
Claire,27,Female"""

csv_path = 'sample_data.csv'
with open(csv_path, 'w') as handle:
    handle.write(data)

# Load the file back as a DataFrame and show it
df_csv = pd.read_csv(csv_path)
print("DataFrame from CSV:\n", df_csv)


DataFrame from CSV:
      Name  Age  Gender
0   Alice   25  Female
1     Bob   30    Male
2  Claire   27  Female


In [2]:
# Preview only the top two records with head()
first_rows = df_csv.head(2)
print("\nFirst 2 rows of the DataFrame:\n", first_rows)



First 2 rows of the DataFrame:
     Name  Age  Gender
0  Alice   25  Female
1    Bob   30    Male


In [3]:
# Two small integer columns to summarise
df_numbers = pd.DataFrame({
    'A': list(range(1, 6)),
    'B': list(range(5, 10)),
})

# describe() reports count, mean, std, min, quartiles and max per column
summary_stats = df_numbers.describe()
print("\nDescriptive Statistics:\n", summary_stats)



Descriptive Statistics:
               A         B
count  5.000000  5.000000
mean   3.000000  7.000000
std    1.581139  1.581139
min    1.000000  5.000000
25%    2.000000  6.000000
50%    3.000000  7.000000
75%    4.000000  8.000000
max    5.000000  9.000000


In [4]:
# Names repeat so that grouping has something to aggregate
df_group = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Claire', 'Alice', 'Bob'],
    'Age': [25, 30, 27, 26, 31],
})

# Mean age per name; reset_index turns the group keys back into a regular column
ages_by_name = df_group.groupby('Name')['Age']
average_age = ages_by_name.mean().reset_index()
print("\nAverage Age by Name:\n", average_age)



Average Age by Name:
      Name   Age
0   Alice  25.5
1     Bob  30.5
2  Claire  27.0


In [5]:
# Order the rows by ascending Age; the result is bound to a new name
sorted_df = df_csv.sort_values('Age')
print("\nSorted DataFrame by Age:\n", sorted_df)



Sorted DataFrame by Age:
      Name  Age  Gender
0   Alice   25  Female
2  Claire   27  Female
1     Bob   30    Male


# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [6]:
import pandas as pd

def reindex_dataframe(df, start=1, step=2):
    """Return a copy of ``df`` whose index is an arithmetic sequence.

    With the default arguments the new index is 1, 3, 5, ... with one label
    per row, which is what the exercise asks for.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to re-index. It is not modified.
    start : int, default 1
        First index label.
    step : int, default 2
        Difference between consecutive labels. Must be non-zero.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` carrying the new index.

    Raises
    ------
    ValueError
        If ``step`` is 0, since every row would get the same label.
    """
    if step == 0:
        raise ValueError("step must be non-zero so that index labels are unique")

    # The stop value is exclusive, so start + step * len(df) yields exactly len(df) labels
    new_index = pd.RangeIndex(start=start, stop=start + step * len(df), step=step)

    # Work on a copy so the caller's frame keeps its original index
    df_reindexed = df.copy()
    df_reindexed.index = new_index

    return df_reindexed

# Small three-column frame used to demonstrate the helper
data = dict(
    A=[10, 20, 30],
    B=[40, 50, 60],
    C=[70, 80, 90],
)
df = pd.DataFrame(data)

# Build the re-indexed copy
df_reindexed = reindex_dataframe(df)

# Show both frames side by side for comparison
print("Original DataFrame:\n", df)
print("\nRe-indexed DataFrame:\n", df_reindexed)


Original DataFrame:
     A   B   C
0  10  40  70
1  20  50  80
2  30  60  90

Re-indexed DataFrame:
     A   B   C
1  10  40  70
3  20  50  80
5  30  60  90


# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.
For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should calculate and print the sum of the first three values, which is 60.

In [7]:
import pandas as pd

def sum_first_three_values(df):
    """Print the sum of the first three entries of the 'Values' column.

    If the column is absent or holds fewer than three rows, a notice is
    printed instead. Nothing is returned in either case.
    """
    # Guard clause: bail out early when there is nothing valid to add up
    if 'Values' not in df.columns or len(df['Values']) < 3:
        print("The 'Values' column is missing or does not contain enough entries.")
        return

    # Walk the first three rows and accumulate their values
    total = 0
    for value in df['Values'].iloc[:3]:
        total += value
    print("Sum of the first three values:", total)

# Sample frame matching the example given in the question
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Prints the sum of the first three entries (10 + 20 + 30)
sum_first_three_values(df)


Sum of the first three values: 60


# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [8]:
import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column to ``df`` in place.

    Each entry is the number of whitespace-separated tokens in the matching
    'Text' value. Missing entries (None / NaN / pd.NA) count as 0 words;
    converting them with ``str()`` first would turn them into the literal
    text "None" or "nan" and report one word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    None
        If the 'Text' column is absent, a notice is printed and ``df`` is
        left unchanged.
    """
    if 'Text' not in df.columns:
        print("The 'Text' column is missing in the DataFrame.")
        return

    def _count_words(value):
        # is_scalar guard: pd.isna on a list-like returns an array, not a bool
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 0
        # str() keeps non-string scalars (e.g. numbers) countable as before
        return len(str(value).split())

    df['Word_Count'] = df['Text'].apply(_count_words)

# Sentences of different lengths to exercise the word counter
sentences = [
    "Hello world",
    "This is a sample text.",
    "Pandas is great for data analysis.",
    "How many words are in this sentence?",
]
data = {'Text': sentences}
df = pd.DataFrame(data)

# The helper adds 'Word_Count' to df in place
add_word_count_column(df)

# Show the frame with its new column
print(df)


                                   Text  Word_Count
0                           Hello world           2
1                This is a sample text.           5
2    Pandas is great for data analysis.           6
3  How many words are in this sentence?           7


# Q5. How are DataFrame.size and DataFrame.shape different?

DataFrame.size and DataFrame.shape are both attributes in Pandas used to obtain information about the size of a DataFrame, but they provide different types of information.

# Differences:
**Return value:**

- DataFrame.size: Returns the total number of elements in the DataFrame. This is calculated as the product of the number of rows and the number of columns.
- DataFrame.shape: Returns a tuple representing the dimensions of the DataFrame. The tuple consists of two values: the number of rows and the number of columns.

**Data type:**

- DataFrame.size: Returns an integer value.
- DataFrame.shape: Returns a tuple of two integers.

**Examples:**

In [9]:
import pandas as pd

# A frame with three rows and three columns
data = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
df = pd.DataFrame(data)

# shape is the (rows, columns) tuple; size is the total element count
dimensions = df.shape
total_elements = df.size

print("Total number of elements (size):", total_elements)
print("Dimensions (shape):", dimensions)


Total number of elements (size): 9
Dimensions (shape): (3, 3)


# Q6. Which function of pandas do we use to read an excel file?

To read an Excel file in Pandas, you use the pandas.read_excel() function. This function can read both .xls and .xlsx file formats (an optional engine package must be installed, e.g. openpyxl for .xlsx or xlrd for .xls).

Example Usage:
Here’s a simple example demonstrating how to use read_excel():

In [11]:
import pandas as pd

# Read an Excel file into a DataFrame.
# 'path_to_your_file.xlsx' is a placeholder; point excel_path at a real workbook.
excel_path = 'path_to_your_file.xlsx'

try:
    df = pd.read_excel(excel_path)
except FileNotFoundError:
    # Keep "Restart & Run All" working while the placeholder path is in place;
    # df is left untouched in that case.
    print("Excel file not found:", excel_path, "- update excel_path to run this example.")
else:
    # Display the DataFrame
    print(df)


# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address. The username is the part of the email address that appears before the '@' symbol.

For example, if the email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your function should extract the username from each email address and store it in the new 'Username' column.

In [12]:
import pandas as pd

def add_username_column(df):
    """Add a 'Username' column to ``df`` in place.

    The username is the part of each 'Email' value that precedes the first
    '@'. A value with no '@' is copied unchanged. Missing emails (None / NaN)
    stay missing in 'Username' instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    None
        If the 'Email' column is absent, a notice is printed and ``df`` is
        left unchanged.
    """
    if 'Email' not in df.columns:
        print("The 'Email' column is missing in the DataFrame.")
        return

    # na_action='ignore' passes missing values through instead of calling
    # .split on them; maxsplit=1 stops splitting after the first '@'.
    df['Username'] = df['Email'].map(
        lambda address: address.split('@', 1)[0],
        na_action='ignore',
    )

# A few addresses in the 'username@domain.com' format
addresses = [
    'john.doe@example.com',
    'alice.smith@domain.com',
    'bob.jones@test.com',
]
data = {'Email': addresses}
df = pd.DataFrame(data)

# The helper adds 'Username' to df in place
add_username_column(df)

# Show the frame with its new column
print(df)


                    Email     Username
0    john.doe@example.com     john.doe
1  alice.smith@domain.com  alice.smith
2      bob.jones@test.com    bob.jones


# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The function should return a new DataFrame that contains only the selected rows.


For example, if df contains the following values:
A B C
0 3 5 1
1 8 2 7
2 6 9 4
3 2 3 5
4 9 1 2

In [13]:
import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where 'A' is above 5 and 'B' is below 10."""
    # Build each condition as its own boolean mask, then combine them
    a_above_five = df['A'].gt(5)
    b_below_ten = df['B'].lt(10)
    return df.loc[a_above_five & b_below_ten]

# Same values as the table shown in the question
data = dict(
    A=[3, 8, 6, 2, 9],
    B=[5, 2, 9, 3, 1],
    C=[1, 7, 4, 5, 2],
)
df = pd.DataFrame(data)

# Keep only the rows that satisfy both conditions
result_df = select_rows(df)

# Show the filtered frame
print(result_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2
