<a href="https://colab.research.google.com/github/NandiniBasdwar7/concise/blob/main/day10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Data transformation: apply, map, replace
# apply() runs a custom or built-in function over a DataFrame or Series.
# On a DataFrame, axis=1 passes each row to the function and axis=0 passes
# each column; on a Series the function is called once per value.
import pandas as pd

df = pd.DataFrame({'A': [10, 20, 30], 'B': [5, 10, 15]})


def multiply_a_by_b(row):
    """Return the product of the 'A' and 'B' entries of a single row."""
    return row['A'] * row['B']


# axis=1 hands one row at a time to the function, so C is derived from A and B per row.
df['C'] = df.apply(multiply_a_by_b, axis=1)
print(df)

    A   B    C
0  10   5   50
1  20  10  200
2  30  15  450


In [5]:
# map() transforms or replaces the values of a Series using a dictionary or a
# function. It is particularly useful when there is a defined set of values to
# replace or transform, often based on a mapping.
import pandas as pd

df = pd.DataFrame({'Age': [25, 35, 20, 40]})


def age_category(age):
    """Return 'Young' for ages below 30 and 'Old' for every other age."""
    if age < 30:
        return 'Young'
    return 'Old'


# map() calls age_category once per value of the 'Age' column.
ages = df['Age']
df['Age_Category'] = ages.map(age_category)
print(df)


   Age Age_Category
0   25        Young
1   35          Old
2   20        Young
3   40          Old


In [6]:
import pandas as pd

# Sample tweets containing tokens that start with '#', '%' or '@'
data = {
    'Tweet': [
        'We are launching #newproduct!',
        'Check out our latest news!',
        'Exciting updates on %projectX',
        '@innovation is key',
    ]
}
df = pd.DataFrame(data)

# Remove every token made of '#', '%' or '@' followed by word characters.
# Only the token itself is removed, so the whitespace around it stays in place.
tagged_token = r'[#%@]\w+'
df['Tweet'] = df['Tweet'].str.replace(tagged_token, "", regex=True)
print(df)

                        Tweet
0          We are launching !
1  Check out our latest news!
2        Exciting updates on 
3                      is key


In [7]:
# Methods for adding columns
# 1. Direct assignment with a list of values
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Assigning a list to a label that does not exist yet creates that column.
new_values = [7, 8, 9]
df['new_column'] = new_values
print(df)



   A  B  new_column
0  1  4           7
1  2  5           8
2  3  6           9


In [32]:
# 2. Using df.insert() to add a column at a specific position
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# loc=2 puts 'C' at position 2, i.e. after 'A' (position 0) and 'B' (position 1).
# insert() modifies df itself.
df.insert(loc=2, column='C', value=[7, 8, 9])
print(df)

   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9


In [13]:
# 3. Using df.loc[] to create or update columns by label
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df)  # before: only columns A and B

# ':' selects every row and 'D' names the column; 'D' does not exist yet, so it is created.
column_d = [10, 11, 12]
df.loc[:, 'D'] = column_d
print(df)  # after: column D has been added


   A  B
0  1  4
1  2  5
2  3  6
   A  B   D
0  1  4  10
1  2  5  11
2  3  6  12


In [14]:
# 4. Using df.assign() to add new columns and return a new DataFrame
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# assign() returns a new DataFrame with column 'C' added; df itself is not changed.
new_df = df.assign(C=[13, 14, 15])

# Print the new frame first, then the original, to show that the original is untouched.
for frame in (new_df, df):
    print(frame)

   A  B   C
0  1  4  13
1  2  5  14
2  3  6  15
   A  B
0  1  4
1  2  5
2  3  6


In [15]:
# 5. Creating columns with calculated values from existing columns
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Adding two columns works element by element, so C holds A + B for each row.
col_a, col_b = df['A'], df['B']
df['C'] = col_a + col_b
print(df)


   A  B  C
0  1  4  5
1  2  5  7
2  3  6  9


In [17]:
# 1. Renaming columns: rename() takes a dictionary through its `columns`
# argument, where the keys are the old column names and the values are the
# new column names.
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

column_names = {'A': 'First', 'B': 'Second'}  # old name -> new name
df = df.rename(columns=column_names)
print(df)


   First  Second
0      1       4
1      2       5
2      3       6


In [20]:
# Renaming index labels: rename() also accepts a dictionary through its
# `index` argument, where the keys are the old index labels and the values
# are the new ones.
import pandas as pd

df = pd.DataFrame(
    {'Name': ['John', 'Alice'], 'Age': [25, 30]},
    index=['a', 'b'],
)

row_labels = {'a': 'FirstRow', 'b': 'SecondRow'}  # old label -> new label
df = df.rename(index=row_labels)
print(df)


            Name  Age
FirstRow    John   25
SecondRow  Alice   30


In [21]:
# In-place renaming: passing inplace=True to rename() applies the change
# directly to the original DataFrame instead of returning a new one.
df = pd.DataFrame({'Name': ['aa', 'bb', 'cc'], 'age': [22, 34, 24]})

# Call rename() as a method on the DataFrame itself. With inplace=True the call
# modifies df, so its result is not assigned back.
df.rename(columns={'Name': 'First', 'age': 'Second'}, inplace=True)
print(df)

  First  Second
0    aa      22
1    bb      34
2    cc      24


In [22]:
# Renaming by position: instead of mapping old names to new ones, supply the
# complete list of column labels. Each label applies to the column at the
# same position.
df = pd.DataFrame({'Name': ['aa', 'bb', 'cc'], 'age': [22, 34, 24]})

new_columns = ['Name', 'Age']  # position 0 keeps 'Name', position 1 becomes 'Age'
df = df.set_axis(new_columns, axis=1)
print(df)

  Name  Age
0   aa   22
1   bb   34
2   cc   24


In [24]:
# 2. Indexing - setting the index: set_index() makes one or more columns the
# DataFrame's index. This is useful for turning specific columns into row
# identifiers, which helps with data retrieval and alignment.
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'San Francisco', 'Los Angeles'],
}

df = pd.DataFrame(data)
print(df)  # before: default integer index, 'Name' is a regular column

# 'Name' becomes the row labels and is no longer listed among the columns.
df = df.set_index(keys='Name')
print(df)

      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   35    Los Angeles
         Age           City
Name                       
Alice     25       New York
Bob       30  San Francisco
Charlie   35    Los Angeles


In [25]:
# Setting multiple columns as the index (MultiIndex): passing a list of column
# names to set_index() produces a hierarchical index built from those columns.
import pandas as pd

index_columns = ['Region', 'Product']
df = pd.DataFrame(
    {
        'Region': ['North', 'South'],
        'Product': ['A', 'B'],
        'Sales': [100, 150],
    }
).set_index(index_columns)
print(df)


                Sales
Region Product       
North  A          100
South  B          150


In [27]:
# Resetting the index: reset_index() moves the current index into a regular
# column and replaces it with the default integer index. This is useful when
# the index should be treated as ordinary data or a simpler index is wanted.
import pandas as pd

data = {
    'Region': ['North', 'South'],
    'Product': ['A', 'B'],
    'Sales': [100, 150],
}

# No custom index was set here, so the existing integer index is what gets
# moved; it shows up as a column named 'index'.
df = pd.DataFrame(data).reset_index()
print(df)


   index Region Product  Sales
0      0  North       A    100
1      1  South       B    150


In [28]:
# Dropping the index when resetting: reset_index(drop=True) discards the old
# index values instead of keeping them as a column.
data = {
    'Region': ['North', 'South'],
    'Product': ['A', 'B'],
    'Sales': [100, 150],
}

# With drop=True no 'index' column is added; only the original columns remain.
df = pd.DataFrame(data).reset_index(drop=True)
print(df)

  Region Product  Sales
0  North       A    100
1  South       B    150


In [30]:
import pandas as pd

# Sample order data (replace this with your actual data loading)
data = {
    'OrderID': [1, 2, 3, 4, 5],
    'Price': [100, 150, 75, 200, 120],
    'Quantity': [2, 1, 3, 1, 2],
    'Discount': [0.1, 0.05, 0.15, 0.2, 0.1],
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Min-max normalization: the smallest price maps to 0 and the largest to 1.
price = df['Price']
lowest_price, highest_price = price.min(), price.max()
df['Normalized_Price'] = (price - lowest_price) / (highest_price - lowest_price)

# Derived metric: gross order value (price * quantity) reduced by the discount fraction.
gross_value = price * df['Quantity']
df['Net_Revenue_Per_Order'] = gross_value * (1 - df['Discount'])

print("\nDataFrame with Normalized Price and Derived Metric:")
print(df)

Original DataFrame:
   OrderID  Price  Quantity  Discount
0        1    100         2      0.10
1        2    150         1      0.05
2        3     75         3      0.15
3        4    200         1      0.20
4        5    120         2      0.10

DataFrame with Normalized Price and Derived Metric:
   OrderID  Price  Quantity  Discount  Normalized_Price  Net_Revenue_Per_Order
0        1    100         2      0.10              0.20                 180.00
1        2    150         1      0.05              0.60                 142.50
2        3     75         3      0.15              0.00                 191.25
3        4    200         1      0.20              1.00                 160.00
4        5    120         2      0.10              0.36                 216.00


In [38]:
import pandas as pd

# Example product data (replace with your actual data)
data = {
    'Product': ['Milk', 'Bread', 'Coffee', 'Juice'],
    'Price': [2.50, 3.00, 5.00, 4.25],
    'Quantity': [10, 5, 8, 12],
    'Category': ['Dairy & Refrigerated', 'Baked Goods', 'Beverages', 'Beverages'],
}
df = pd.DataFrame(data)

# Revenue has to exist before it can be classified below.
df['Revenue'] = df['Price'] * df['Quantity']


def classify_revenue(revenue):
    """Bucket a revenue figure.

    Returns "high" for revenue of at least 50, "medium" for revenue of at
    least 25, and "low" for anything else.
    """
    if revenue >= 50:
        return "high"
    if revenue >= 25:
        return "medium"
    return "low"


# Classify each product's revenue, one value at a time.
revenue = df['Revenue']
df['Revenue_Class'] = revenue.apply(classify_revenue)
print(df)




  Product  Price  Quantity              Category  Revenue Revenue_Class
0    Milk   2.50        10  Dairy & Refrigerated     25.0        medium
1   Bread   3.00         5           Baked Goods     15.0           low
2  Coffee   5.00         8             Beverages     40.0        medium
3   Juice   4.25        12             Beverages     51.0          high
