In [23]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats
import numpy as np


ex 1

In [26]:
# Load the dataset
data = pd.read_csv("customers-100.csv", on_bad_lines='skip')


In [27]:
print(data.head(3))  # Print the first 3 rows



   Index      Customer Id First Name Last Name          Company  \
0      1  DD37Cf93aecA6Dc     Sheryl    Baxter  Rasmussen Group   
1      2  1Ef7b82A4CAAD10    Preston    Lozano      Vega-Gentry   
2      3  6F94879bDAfE5a6        Roy     Berry    Murillo-Perry   

                City              Country          Phone 1  \
0       East Leonard                Chile     229.077.5154   
1  East Jimmychester             Djibouti       5153435776   
2      Isabelborough  Antigua and Barbuda  +1-539-402-0259   

               Phone 2                     Email Subscription Date  \
0     397.884.0519x718  zunigavanessa@smith.info        2020-08-24   
1     686-620-1820x944           vmata@colon.com        2021-04-23   
2  (496)978-3969x58947       beckycarr@hogan.com        2020-03-25   

                      Website  
0  http://www.stephenson.com/  
1       http://www.hobbs.com/  
2    http://www.lawrence.com/  


In [28]:
print(data.tail(3))  # Print the last 3 rows


    Index      Customer Id First Name Last Name             Company  \
97     98  28CDbC0dFe4b1Db       Fred    Guerra       Schmitt-Jones   
98     99  c23d1D9EE8DEB0A     Yvonne    Farmer  Fitzgerald-Harrell   
99    100  2354a0E336A91A1   Clarence    Haynes  Le, Nash and Cross   

               City          Country               Phone 1  \
97       Ortegaland  Solomon Islands  +1-753-067-8419x7170   
98  Lake Elijahview            Aruba         (530)311-9786   
99        Judymouth         Honduras         (753)813-6941   

                   Phone 2                           Email Subscription Date  \
97   +1-632-666-7507x92121                swagner@kane.org        2021-09-18   
98  001-869-452-0943x12424  mccarthystephen@horn-green.biz        2021-08-11   
99            783.639.1472          colleen91@faulkner.biz        2020-03-11   

                              Website  
97              https://www.ross.com/  
98               http://watkins.info/  
99  http://www.hatfield-s

In [29]:
print(data.info())  # Print dataset information



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Index              100 non-null    int64 
 1   Customer Id        100 non-null    object
 2   First Name         100 non-null    object
 3   Last Name          100 non-null    object
 4   Company            100 non-null    object
 5   City               100 non-null    object
 6   Country            100 non-null    object
 7   Phone 1            100 non-null    object
 8   Phone 2            100 non-null    object
 9   Email              100 non-null    object
 10  Subscription Date  100 non-null    object
 11  Website            100 non-null    object
dtypes: int64(1), object(11)
memory usage: 9.5+ KB
None


In [32]:
# Check for missing values
missing_values = data.isnull()
print("Missing values:\n", missing_values)



Missing values:
     Index  Customer Id  First Name  Last Name  Company   City  Country  \
0   False        False       False      False    False  False    False   
1   False        False       False      False    False  False    False   
2   False        False       False      False    False  False    False   
3   False        False       False      False    False  False    False   
4   False        False       False      False    False  False    False   
..    ...          ...         ...        ...      ...    ...      ...   
95  False        False       False      False    False  False    False   
96  False        False       False      False    False  False    False   
97  False        False       False      False    False  False    False   
98  False        False       False      False    False  False    False   
99  False        False       False      False    False  False    False   

    Phone 1  Phone 2  Email  Subscription Date  Website  
0     False    False  False         

In [33]:
# Data types of each column
data_types = data.dtypes
print(data_types)

Index                 int64
Customer Id          object
First Name           object
Last Name            object
Company              object
City                 object
Country              object
Phone 1              object
Phone 2              object
Email                object
Subscription Date    object
Website              object
dtype: object


ex 2

In [34]:
# Counting missing values in the dataset
missing_counts = data.isnull().sum()

# Print the number of missing values for each column
print("\nNumber of missing values:\n", missing_counts)



Number of missing values:
 Index                0
Customer Id          0
First Name           0
Last Name            0
Company              0
City                 0
Country              0
Phone 1              0
Phone 2              0
Email                0
Subscription Date    0
Website              0
dtype: int64


In [35]:
# Remove rows with missing values from the dataset
data_dropna = data.dropna()

# Print the dataset after removing rows with missing values
print("After removing rows with missing values:\n", data_dropna)


After removing rows with missing values:
     Index      Customer Id First Name Last Name  \
0       1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1       2  1Ef7b82A4CAAD10    Preston    Lozano   
2       3  6F94879bDAfE5a6        Roy     Berry   
3       4  5Cef8BFA16c5e3c      Linda     Olsen   
4       5  053d585Ab6b3159     Joanna    Bender   
..    ...              ...        ...       ...   
95     96  cb8E23e48d22Eae       Karl     Greer   
96     97  CeD220bdAaCfaDf       Lynn  Atkinson   
97     98  28CDbC0dFe4b1Db       Fred    Guerra   
98     99  c23d1D9EE8DEB0A     Yvonne    Farmer   
99    100  2354a0E336A91A1   Clarence    Haynes   

                            Company               City  \
0                   Rasmussen Group       East Leonard   
1                       Vega-Gentry  East Jimmychester   
2                     Murillo-Perry      Isabelborough   
3   Dominguez, Mcmillan and Donovan         Bensonview   
4          Martin, Lang and Andrade     West Priscilla 

In [36]:
# Fill missing values with the mean of each numeric column
data_fill_mean = data.fillna(data.mean(numeric_only=True))

# Print the dataset after filling missing values with the mean
print("After filling missing values with the mean:\n", data_fill_mean)


After filling missing values with the mean:
     Index      Customer Id First Name Last Name  \
0       1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1       2  1Ef7b82A4CAAD10    Preston    Lozano   
2       3  6F94879bDAfE5a6        Roy     Berry   
3       4  5Cef8BFA16c5e3c      Linda     Olsen   
4       5  053d585Ab6b3159     Joanna    Bender   
..    ...              ...        ...       ...   
95     96  cb8E23e48d22Eae       Karl     Greer   
96     97  CeD220bdAaCfaDf       Lynn  Atkinson   
97     98  28CDbC0dFe4b1Db       Fred    Guerra   
98     99  c23d1D9EE8DEB0A     Yvonne    Farmer   
99    100  2354a0E336A91A1   Clarence    Haynes   

                            Company               City  \
0                   Rasmussen Group       East Leonard   
1                       Vega-Gentry  East Jimmychester   
2                     Murillo-Perry      Isabelborough   
3   Dominguez, Mcmillan and Donovan         Bensonview   
4          Martin, Lang and Andrade     West Priscil

In [37]:
# Forward fill missing values in the dataset
data_ffill = data.ffill()

# Print the dataset after forward filling missing values
print("After forward filling missing values:\n", data_ffill)


After forward filling missing values:
     Index      Customer Id First Name Last Name  \
0       1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1       2  1Ef7b82A4CAAD10    Preston    Lozano   
2       3  6F94879bDAfE5a6        Roy     Berry   
3       4  5Cef8BFA16c5e3c      Linda     Olsen   
4       5  053d585Ab6b3159     Joanna    Bender   
..    ...              ...        ...       ...   
95     96  cb8E23e48d22Eae       Karl     Greer   
96     97  CeD220bdAaCfaDf       Lynn  Atkinson   
97     98  28CDbC0dFe4b1Db       Fred    Guerra   
98     99  c23d1D9EE8DEB0A     Yvonne    Farmer   
99    100  2354a0E336A91A1   Clarence    Haynes   

                            Company               City  \
0                   Rasmussen Group       East Leonard   
1                       Vega-Gentry  East Jimmychester   
2                     Murillo-Perry      Isabelborough   
3   Dominguez, Mcmillan and Donovan         Bensonview   
4          Martin, Lang and Andrade     West Priscilla   


In [38]:
# Backward fill missing values in the dataset
data_bfill = data.bfill()

# Print the dataset after backward filling missing values
print("After backward filling missing values:\n", data_bfill)


After backward filling missing values:
     Index      Customer Id First Name Last Name  \
0       1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1       2  1Ef7b82A4CAAD10    Preston    Lozano   
2       3  6F94879bDAfE5a6        Roy     Berry   
3       4  5Cef8BFA16c5e3c      Linda     Olsen   
4       5  053d585Ab6b3159     Joanna    Bender   
..    ...              ...        ...       ...   
95     96  cb8E23e48d22Eae       Karl     Greer   
96     97  CeD220bdAaCfaDf       Lynn  Atkinson   
97     98  28CDbC0dFe4b1Db       Fred    Guerra   
98     99  c23d1D9EE8DEB0A     Yvonne    Farmer   
99    100  2354a0E336A91A1   Clarence    Haynes   

                            Company               City  \
0                   Rasmussen Group       East Leonard   
1                       Vega-Gentry  East Jimmychester   
2                     Murillo-Perry      Isabelborough   
3   Dominguez, Mcmillan and Donovan         Bensonview   
4          Martin, Lang and Andrade     West Priscilla   

ex 3

In [39]:
# Creating a DataFrame with 'Place' and 'Score' columns
df = pd.DataFrame({
    'Place': [1, 2, 3, 4, 5],
    'Score': [50, 40, 30, 20, 10]
})

# Normalizing data to a range between 0 and 1 using Min-Max Scaler
scaler = MinMaxScaler()

# Apply Min-Max scaling to the DataFrame
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Print the normalized DataFrame
print("Min-Max Scaled Data:\n", df_normalized)


Min-Max Scaled Data:
    Place  Score
0   0.00   1.00
1   0.25   0.75
2   0.50   0.50
3   0.75   0.25
4   1.00   0.00


In [40]:
df_gender = pd.DataFrame({
    'Gender': ['MALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE']
})

df_encoded = pd.get_dummies(df_gender, columns=['Gender'])
print("One-hot Encoded Data:\n", df_encoded)


One-hot Encoded Data:
    Gender_FEMALE  Gender_MALE
0          False         True
1          False         True
2           True        False
3          False         True
4           True        False


In [41]:
df_continuous = pd.DataFrame({
    'Value': [1, 5, 10, 15, 20]
})

bins = [0, 5, 10, 15, 20]
labels = ['0-5', '5-10', '10-15', '15-20']

df_binned = pd.cut(df_continuous['Value'], bins=bins, labels=labels)
df_continuous['Binned'] = df_binned
print("Binned Data:\n", df_continuous)


Binned Data:
    Value Binned
0      1    0-5
1      5    0-5
2     10   5-10
3     15  10-15
4     20  15-20


ex 4

In [42]:
df = pd.DataFrame({
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [10, 20, 30, 40, 50]
})

poly = PolynomialFeatures(degree=2, include_bias=False)
df_poly = pd.DataFrame(poly.fit_transform(df), columns=['Feature1', 'Feature2', 'Feature1^2', 'Feature1*Feature2', 'Feature2^2'])

print("Polynomial features:\n", df_poly)


Polynomial features:
    Feature1  Feature2  Feature1^2  Feature1*Feature2  Feature2^2
0       1.0      10.0         1.0               10.0       100.0
1       2.0      20.0         4.0               40.0       400.0
2       3.0      30.0         9.0               90.0       900.0
3       4.0      40.0        16.0              160.0      1600.0
4       5.0      50.0        25.0              250.0      2500.0


In [43]:
df_dates = pd.DataFrame({
    'Date': ['2024-09-12', '2024-10-01', '2024-12-25', '2025-01-15', '2025-03-20']
})

df_dates['Date'] = pd.to_datetime(df_dates['Date'])

df_dates['Year'] = df_dates['Date'].dt.year
df_dates['Month'] = df_dates['Date'].dt.month
df_dates['Day'] = df_dates['Date'].dt.day
df_dates['DayOfWeek'] = df_dates['Date'].dt.dayofweek
df_dates['Quarter'] = df_dates['Date'].dt.quarter

print("Data and received features:\n", df_dates)


Data and received features:
         Date  Year  Month  Day  DayOfWeek  Quarter
0 2024-09-12  2024      9   12          3        3
1 2024-10-01  2024     10    1          1        4
2 2024-12-25  2024     12   25          2        4
3 2025-01-15  2025      1   15          2        1
4 2025-03-20  2025      3   20          3        1


In [44]:
df_sales = pd.DataFrame({
    'SalePrice': [100, 150, 200, 300],
    'CostPrice': [70, 100, 150, 250]
})

df_sales['Profit'] = df_sales['SalePrice'] - df_sales['CostPrice']
df_sales['Discount'] = (df_sales['CostPrice'] / df_sales['SalePrice']) * 100

print("Sales with a new features:\n", df_sales)


Sales with a new features:
    SalePrice  CostPrice  Profit   Discount
0        100         70      30  70.000000
1        150        100      50  66.666667
2        200        150      50  75.000000
3        300        250      50  83.333333


ex 5

In [45]:
df = pd.DataFrame({
    'ID': [1, 2, 2, 4],
    'Name': ['Alice', 'Bob', 'Bob', 'David'],
    'Value': [10, 20, 20, 40]
})

print("Existing data frame:\n", df)

df_cleaned = df.drop_duplicates()
print("\ndata frame after deleting duplicates :\n", df_cleaned)


Existing data frame:
    ID   Name  Value
0   1  Alice     10
1   2    Bob     20
2   2    Bob     20
3   4  David     40

data frame after deleting duplicates :
    ID   Name  Value
0   1  Alice     10
1   2    Bob     20
3   4  David     40


In [46]:
# Creating a DataFrame with a column 'Value'
df = pd.DataFrame({
    'Value': [10, 12, 14, 15, 100]
})

# Calculate the Z-scores of the 'Value' column
z_scores = np.abs(stats.zscore(df['Value']))

# Print the Z-scores for each value
print("Z-scores:\n", z_scores)

# Remove rows where the Z-score is 3 or higher (indicating outliers)
df_no_outliers = df[(z_scores < 3)]

# Print the DataFrame after removing outliers
print("\nDataFrame after removing outliers:\n", df_no_outliers)


Z-scores:
 0    0.578097
1    0.520860
2    0.463622
3    0.435004
4    1.997582
Name: Value, dtype: float64

DataFrame after removing outliers:
    Value
0     10
1     12
2     14
3     15
4    100


In [47]:
# Calculate the Interquartile Range (IQR) to remove outliers
Q1 = df['Value'].quantile(0.25)  # First quartile (25th percentile)
Q3 = df['Value'].quantile(0.75)  # Third quartile (75th percentile)
IQR = Q3 - Q1  # Interquartile Range

# Define bounds for outlier detection
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove rows where 'Value' is outside the bounds (outliers)
df_no_outliers_iqr = df[(df['Value'] >= lower_bound) & (df['Value'] <= upper_bound)]

# Print the DataFrame after removing outliers using the IQR method
print("\nDataFrame after removing outliers (IQR method):\n", df_no_outliers_iqr)



DataFrame after removing outliers (IQR method):
    Value
0     10
1     12
2     14
3     15


In [48]:
# Creating a DataFrame with a column 'Category'
df = pd.DataFrame({
    'Category': ['A', 'b', 'B ', 'a', ' C']
})

# Strip leading and trailing spaces and convert to lowercase
df['Category'] = df['Category'].str.strip().str.lower()

# Replace inconsistent category values with standard ones
df['Category'] = df['Category'].replace({'a': 'A', 'b': 'B', 'c': 'C'})

# Print the DataFrame after correcting inconsistencies
print("DataFrame after correcting inconsistencies:\n", df)


DataFrame after correcting inconsistencies:
   Category
0        A
1        B
2        B
3        A
4        C


ex 6

In [50]:
# Creating a DataFrame with features and target variable
data = {
    'Feature1': [1, 2, 3, 4, 5, 6],
    'Feature2': [10, 20, 30, 40, 50, 60],
    'Target': [0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Separating features and target variable
X = df[['Feature1', 'Feature2']]
y = df['Target']

# Splitting the data into training and test sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print the training sets and testing sets
print("Training sets:\n", X_train, y_train)
print("\nTesting sets:\n", X_test, y_test)


Training sets:
    Feature1  Feature2
5         6        60
2         3        30
4         5        50
3         4        40 5    1
2    0
4    0
3    1
Name: Target, dtype: int64

Testing sets:
    Feature1  Feature2
0         1        10
1         2        20 0    0
1    1
Name: Target, dtype: int64


In [51]:
# Splitting the data into training and test sets (80% training, 20% testing)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the training sets and testing sets for the 80-20 split
print("Training sets (80-20 split):\n", X_train_80, y_train_80)
print("\nTesting sets (80-20 split):\n", X_test_20, y_test_20)


Training sets (80-20 split):
    Feature1  Feature2
5         6        60
2         3        30
4         5        50
3         4        40 5    1
2    0
4    0
3    1
Name: Target, dtype: int64

Testing sets (80-20 split):
    Feature1  Feature2
0         1        10
1         2        20 0    0
1    1
Name: Target, dtype: int64


ex 7    

In [52]:
# Creating a DataFrame with features and target variable
data = {
    'age': [25, 30, None, 35, 40],
    'city': ['New York', 'Los Angeles', 'New York', None, 'Chicago'],
    'income': [50000, 60000, None, 80000, 70000],
    'purchased': [1, 0, 1, 0, 1]  # Target variable
}

df = pd.DataFrame(data)

# Separating features and target variable
X = df.drop('purchased', axis=1)
y = df['purchased']

# Defining preprocessing pipelines for numeric and categorical features
numeric_features = ['age', 'income']
categorical_features = ['city']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
    ('scaler', StandardScaler())  # Standardize numeric features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combining preprocessing steps for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Creating a pipeline that first preprocesses the data and then applies a logistic regression model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())  # Logistic Regression model
])

# Splitting the data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train)

# Evaluating the model's accuracy on the test set
score = model.score(X_test, y_test)
print(f'Accuracy: {score:.2f}')


Accuracy: 0.00
