In [214]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import seaborn as sns

# Load the Titanic dataset directly from seaborn
df = sns.load_dataset('titanic')
df.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False


In [215]:
ans = df.drop(columns=[
                       'survived', 'pclass', 'age', 'sibsp',
                       'parch', 'who', 'adult_male', 'alive', 
                       'alone', 'embark_town'
                       ]
              )
ans.head(5) 

Unnamed: 0,sex,fare,embarked,class,deck
0,male,7.25,S,Third,
1,female,71.2833,C,First,C
2,female,7.925,S,Third,
3,female,53.1,S,First,C
4,male,8.05,S,Third,


In [216]:
Ans = ans.filter(items=['sex', 'embarked', 'deck', 'class', 'fare'])

In [217]:
X_train, X_test, y_train, y_test = train_test_split(Ans.drop(columns=['fare']), 
                                                    Ans['fare'], test_size=0.2,  random_state=0)


In [218]:
Ans.isna().sum()

sex           0
embarked      2
deck        688
class         0
fare          0
dtype: int64

In [219]:
Trf1 = ColumnTransformer([
     ("imputer_embarked", SimpleImputer(strategy='most_frequent'), [1]),
     ("imputer_deck", SimpleImputer(strategy='most_frequent'), [2]),
], remainder='passthrough')

Trf2 = ColumnTransformer([
     ("Ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [3]),
     ("OneHot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0]),
     ("OneHots", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [1]),
     ("OnesHot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [2])
], remainder='passthrough')

Trf3 = ColumnTransformer([
     ("Scale", MinMaxScaler(), slice(0, 12))
])

Trf4 = SelectKBest(score_func=chi2, k = 5)
Trf5 = RandomForestRegressor()
from sklearn.pipeline import Pipeline
Pipes = Pipeline([
     ("Imputer",Trf1),
     ("Encoder",Trf2),
     ("Scaler",Trf3),
     ("Sfeatures",Trf4),
      ("Model",Trf5),

])
Pipes

In [220]:
Ans.isnull().sum()

sex           0
embarked      2
deck        688
class         0
fare          0
dtype: int64

In [221]:
Trf2.set_output(transform='pandas')

In [222]:
Trf2.fit_transform(X_train)

Unnamed: 0,Ordinal__class,OneHot__sex_female,OneHot__sex_male,OneHots__embarked_C,OneHots__embarked_Q,OneHots__embarked_S,OneHots__embarked_nan,OnesHot__deck_A,OnesHot__deck_B,OnesHot__deck_C,OnesHot__deck_D,OnesHot__deck_E,OnesHot__deck_F,OnesHot__deck_G,OnesHot__deck_nan
140,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
439,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
817,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
378,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
491,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
192,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
559,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [223]:
import seaborn as sns

# Load the Titanic dataset directly from seaborn
data = sns.load_dataset('titanic')
data.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False


In [224]:
data.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [225]:
Demo = data.filter(items=['embarked','deck', 'embark_town', 'who', 'adult_male',  'alive', 'alone','class', 'fare'])
Demo.isna().sum()

embarked         2
deck           688
embark_town      2
who              0
adult_male       0
alive            0
alone            0
class            0
fare             0
dtype: int64

In [226]:
Demo['deck'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [227]:
Demo['deck'] = Demo['deck'].fillna('C', inplace=True)

In [228]:
X_train, X_test ,  y_train, y_test = train_test_split(Demo.drop(columns = ['fare']), Demo['fare'], test_size=0.2, random_state=42)

In [229]:
Demo[Demo['deck'].isnull()]

Unnamed: 0,embarked,deck,embark_town,who,adult_male,alive,alone,class,fare
0,S,,Southampton,man,True,no,False,Third,7.2500
1,C,,Cherbourg,woman,False,yes,False,First,71.2833
2,S,,Southampton,woman,False,yes,True,Third,7.9250
3,S,,Southampton,woman,False,yes,False,First,53.1000
4,S,,Southampton,man,True,no,True,Third,8.0500
...,...,...,...,...,...,...,...,...,...
886,S,,Southampton,man,True,no,True,Second,13.0000
887,S,,Southampton,woman,False,yes,True,First,30.0000
888,S,,Southampton,woman,False,no,False,Third,23.4500
889,C,,Cherbourg,man,True,yes,True,First,30.0000


In [230]:
T1 = ColumnTransformer([
     ("Imputer_embarked", SimpleImputer(strategy='most_frequent'), [0]),
     ("Imputer_deck", SimpleImputer(strategy='most_frequent'), [1]),
     ("Imputer_embark_town", SimpleImputer(strategy='most_frequent'), [2])
], remainder='passthrough')

In [231]:
Demo.isnull().sum()

embarked         2
deck           891
embark_town      2
who              0
adult_male       0
alive            0
alone            0
class            0
fare             0
dtype: int64

In [232]:
Demo['embark_town'].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [233]:
T2 = ColumnTransformer([
     ("Ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [7]),
     ("OheHot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,1,2,3,4,5,6])
],  remainder='passthrough')

T2.set_output(transform='pandas')

In [234]:
T2.fit_transform(X_train)

Unnamed: 0,Ordinal__class,OheHot__embarked_C,OheHot__embarked_Q,OheHot__embarked_S,OheHot__embarked_nan,OheHot__deck_None,OheHot__embark_town_Cherbourg,OheHot__embark_town_Queenstown,OheHot__embark_town_Southampton,OheHot__embark_town_nan,OheHot__who_child,OheHot__who_man,OheHot__who_woman,OheHot__adult_male_0.0,OheHot__adult_male_1.0,OheHot__alive_no,OheHot__alive_yes,OheHot__alone_0.0,OheHot__alone_1.0
331,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
733,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
382,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
704,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
813,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
270,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
860,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
435,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [235]:
T3 = ColumnTransformer([
     ("Scaler", MinMaxScaler(), slice(0, 26))
])

In [236]:
T4 = SelectKBest(score_func=chi2, k=10)


In [237]:
T5 = RandomForestRegressor()

In [238]:
pipes = Pipeline([
     ("Imputer", T1),
     ("Encoder", T2),
     ("Scaler", T3),
     ("Features", T4),
     ("Model", T5),
])

In [239]:
pipes

In [240]:
X_train.dtypes

embarked         object
deck             object
embark_town      object
who              object
adult_male         bool
alive            object
alone              bool
class          category
dtype: object

In [241]:
X_train

Unnamed: 0,embarked,deck,embark_town,who,adult_male,alive,alone,class
331,S,,Southampton,man,True,no,True,First
733,S,,Southampton,man,True,no,True,Second
382,S,,Southampton,man,True,no,True,Third
704,S,,Southampton,man,True,no,False,Third
813,S,,Southampton,child,False,no,False,Third
...,...,...,...,...,...,...,...,...
106,S,,Southampton,woman,False,yes,True,Third
270,S,,Southampton,man,True,no,True,First
860,S,,Southampton,man,True,no,False,Third
435,S,,Southampton,child,False,yes,False,First


In [242]:
y_train

331     28.5000
733     13.0000
382      7.9250
704      7.8542
813     31.2750
         ...   
106      7.6500
270     31.0000
860     14.1083
435    120.0000
102     77.2875
Name: fare, Length: 712, dtype: float64

In [243]:
import pandas as pd
import numpy as np

# Define the data with numeric labels for sentiment
data = {
    "Social Media Platform": ["Twitter", "Facebook", "Instagram", "Twitter", "Facebook",
                              "Instagram", "Twitter", "Facebook", "Instagram", "Twitter"],
    "Review": ["Love the new update!", "Too many ads now", "Great for sharing photos",
               "Newsfeed algorithm is biased", "Privacy concerns with latest update",
               "Amazing filters!", "Too much spam", "Easy to connect with friends",
               "Stories feature is fantastic", "Customer support lacking"],
    "age": [21, 19, np.nan, 17, 24, np.nan, 30, 19, 16, 31],
    "Sentiment": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0]  # Numeric labels: 1 for Positive, 0 for Negative
}

# Create a DataFrame
df = pd.DataFrame(data)

print(df)

  Social Media Platform                               Review   age  Sentiment
0               Twitter                 Love the new update!  21.0          1
1              Facebook                     Too many ads now  19.0          0
2             Instagram             Great for sharing photos   NaN          1
3               Twitter         Newsfeed algorithm is biased  17.0          0
4              Facebook  Privacy concerns with latest update  24.0          0
5             Instagram                     Amazing filters!   NaN          1
6               Twitter                        Too much spam  30.0          0
7              Facebook         Easy to connect with friends  19.0          1
8             Instagram         Stories feature is fantastic  16.0          1
9               Twitter             Customer support lacking  31.0          0


In [244]:
def count_words(reviews):
     return np.array([len(review.split()) for  review in reviews]).reshape(-1, 1)


In [245]:
from sklearn.preprocessing import FunctionTransformer
word_count_transformer = FunctionTransformer(count_words)

In [246]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
feature_union = FeatureUnion([
     ('word_count', word_count_transformer),
     ('bag_of_words', CountVectorizer())
])

In [247]:
column_transformer = ColumnTransformer(
     transformers=[
          ('age_imputer', SimpleImputer(strategy='mean'), ['age']),
          ('platform_ohe', OneHotEncoder(), ['Social Media Platform']),
           ('review_processing', feature_union, 'Review')
     ], remainder='drop'
)

In [248]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest,chi2

In [249]:
final_pipeline = Pipeline(steps=[
     ('col_transfer', column_transformer),
     ('scaler', MaxAbsScaler()),
     ('selector', SelectKBest(score_func=chi2, k =10)),
     ('classifier', LogisticRegression())
])

In [250]:
final_pipeline.fit(df.drop(columns=['Sentiment']), df['Sentiment'])