In [13]:
## CODE 1

# Import necessary libraries
import pandas as pd
import numpy as np
import string
import re
import unicodedata
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.metrics import accuracy_score as acc
import joblib

# Initialize LabelEncoder
LE = LabelEncoder()

# Read the Excel file into a DataFrame
df = pd.read_csv("/kaggle/input/airlines/airlines.csv")

# Convert 'Meeting Notes' column to string type
df['Meeting Notes'] = df['Q1 - In your opinion, what should your organization start doing, or do more, to improve its Safety Culture?'].astype(str)

# Get unique topics from the 'Topic' column
unique_topics = df['HIERARCHY_cl'].unique()
print("Unique Topics:", unique_topics)


# Define functions to preprocess text data
def remove_URL(text):
    return re.sub(r"https?://\S+|www.\S+", "", text)

def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def remove_html(text):
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)


# Print initial DataFrame shape and head
print("Initial DataFrame Shape:", df.shape)
print("Initial DataFrame Head:", df.head)

# Keep only relevant columns and drop rows with missing values
df = df[['Meeting Notes', "HIERARCHY_cl"]]
df.dropna(inplace=True)  # Added inplace=True to modify the DataFrame
print("DataFrame Shape After Dropping NaNs:", df.shape)

# Apply preprocessing functions to the 'Meeting Notes' column
df['Clean'] = df['Meeting Notes'].apply(remove_URL)
df['Clean'] = df['Clean'].apply(remove_non_ascii)
df['Clean'] = df['Clean'].apply(remove_html)

# Fit LabelEncoder to the 'Topic' column
le = LE.fit(df["HIERARCHY_cl"])
df["HIERARCHY_cl"] = le.transform(df["HIERARCHY_cl"])

# Separate features (x) and target labels (y)
x = df["Clean"]
y = df["HIERARCHY_cl"]

# Create a pipeline with TF-IDF vectorization and LightGBM classifier
lgb_model = Pipeline((
    ("vect", TfidfVectorizer()),
    ("clc", lgb.LGBMClassifier(boosting_type='gbdt', objective='multiclass'))
))
lgb_model.fit(x, y)

# Set threshold for prediction probabilities
thresholding = False  # Set this to True to enable thresholding
threshold = 0.90 if thresholding else 0.0

# Get prediction probabilities for training data
train_probabilities = lgb_model.predict_proba(x)
topic_names = le.inverse_transform(np.arange(len(lgb_model.classes_)))
result_df = pd.DataFrame(train_probabilities, columns=topic_names)

# Apply thresholding to predicted classes
predicted_classes = [topic_names[i] for i in np.argmax(train_probabilities, axis=1)]
filtered_classes = [cls if prob > threshold else 'Other' for cls, prob in zip(predicted_classes, np.max(train_probabilities, axis=1))]

# Add predicted classes and result probabilities to DataFrame
df["predictedtopic"] = filtered_classes
df = pd.concat([df, result_df], axis=1)
# Filter out rows with 'Other' predicted topic
df = df[df["predictedtopic"] != "Other"]
df = df.dropna()
# Calculate training accuracy
train_acc = acc(le.transform(df["predictedtopic"]), df['HIERARCHY_cl'])
print(df)
# Inverse transform the 'Topic' column back to its original values
df['HIERARCHY_cl'] = le.inverse_transform(df['HIERARCHY_cl'].astype(int))

# Rename the probability columns with topic names
for topic in topic_names:
    df[f'Probability_{topic}'] = df[topic]

# Remove redundant columns and export the DataFrame to an Excel file
df.drop(columns=topic_names, inplace=True)
df.to_excel("MFAtrainingfinal.xlsx", index=False)
print("Train Accuracy:", train_acc)
print("Topic Names:", topic_names)

# Save the trained model using joblib
joblib.dump(lgb_model, "model.pkl")

Unique Topics: ['FRONT LINE' 'Administrative' 'Management' nan
 'Senior Management: executive and divisional heads' 'Front Line'
 'SENIOR MANAGEMENT' 'ADMINISTRATIVE' 'MANAGEMENT']
Initial DataFrame Shape: (15607, 5)
Initial DataFrame Head: <bound method NDFrame.head of          HIERARCHY_cl ORGANIZATIONAL_CL  \
0          FRONT LINE               CAB   
1      Administrative               ORG   
2      Administrative               ORG   
3          FRONT LINE               MNT   
4          FRONT LINE               GRH   
...               ...               ...   
15602      FRONT LINE               NaN   
15603      FRONT LINE               NaN   
15604      FRONT LINE               NaN   
15605      FRONT LINE               NaN   
15606  ADMINISTRATIVE               NaN   

      Q1 - In your opinion, what should your organization start doing, or do more, to improve its Safety Culture?  \
0                                                    NaN                                       

['model.pkl']

In [7]:
df.columns

Index(['Meeting Notes', 'HIERARCHY_cl', 'Clean', 'predictedtopic',
       'ADMINISTRATIVE', 'Administrative', 'FRONT LINE', 'Front Line',
       'MANAGEMENT', 'Management', 'SENIOR MANAGEMENT',
       'Senior Management: executive and divisional heads'],
      dtype='object')

# Code 2 Just comments

In [None]:
Testing Notebook
important
For each function comment the function what is the function intend to do
comments hyper parameter tuning

** If i can setup a testing model just predict model wrapping the code into functions**

import pandas as pd
import numpy as np
import string
import re
import unicodedata
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score as acc
import lightgbm as lgb
import joblib

# Initialize LabelEncoder
LE = LabelEncoder()

# Read the test data from Excel
df = pd.read_excel("testdf.xlsx")

# Convert 'Meeting Notes' column to string type
df['Meeting Notes'] = df['Meeting Notes'].astype(str)

# Keep only relevant columns
df = df[['Meeting Notes', "Topic"]]

# Get unique topics from the 'Topic' column
unique_topics = df['Topic'].unique()


# Define functions to preprocess text data
def remove_URL(text):
    return re.sub(r"https?://\S+|www.\S+", "", text)

def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def remove_html(text):
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)

# Apply preprocessing functions to the 'Meeting Notes' column
df['Clean'] = df['Meeting Notes'].apply(remove_URL)
df['Clean'] = df['Clean'].apply(remove_non_ascii)
df['Clean'] = df['Clean'].apply(remove_html)

# Encode the 'Topic' column using LabelEncoder
df["Topic"] = LE.fit_transform(df["Topic"])

# Separate features (x) and target labels (y)
x = df["Clean"]
y = df["Topic"]

# Load the pre-trained LightGBM model using joblib
lgb_model = joblib.load("model.pkl")

# Set threshold for prediction probabilities
threshold = 0.75

# Get topic names and prediction probabilities for test data
topic_names = LE.inverse_transform(np.arange(len(lgb_model.classes_)))
probabilities = lgb_model.predict_proba(x)
result_df = pd.DataFrame(probabilities, columns=topic_names)

# Apply thresholding to predicted classes
predicted_classes = [topic_names[i] for i in np.argmax(probabilities, axis=1)]
filtered_classes = [cls if prob > threshold else 'NAN' for cls, prob in zip(predicted_classes, np.max(probabilities, axis=1))]

# Add predicted classes and result probabilities to DataFrame
df["predictedtopic"] = filtered_classes
df2 = df.copy()
df2 = df2[df2["predictedtopic"] != "NAN"]

# Calculate the number of records in df2
num_records_df2 = len(df2)
print("Number of records in df2:", num_records_df2)

# Calculate accuracy for filtered data
train_acc = acc(LE.transform(df2["predictedtopic"]), df2['Topic'])
print("Accuracy:", train_acc)

# Concatenate prediction probabilities with the original DataFrame
df = pd.concat([df, result_df], axis=1)

# Inverse transform the 'Topic' column back to its original values
df["Topic"] = LE.inverse_transform(df["Topic"])

# Apply thresholding to prediction probabilities and add as a new column
filtered_prob = [prob if prob > threshold else "NAN" for prob in np.max(probabilities, axis=1)]
df["probability"] = filtered_prob

# Inverse transform the encoded labels back to their original values

# Export the final DataFrame to an Excel file
df.to_excel('final_results.xlsx', index=False)

## Code 3

In [None]:
import pandas as pd
import numpy as np
import re
import unicodedata
import string
import pickle
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline
import gc

Load sentiment analysis model
pipe = pipeline(model="siebert/sentiment-roberta-large-english")

def encoding(text):
text = unicodedata.normalize('NFKD', text)
return text

def remove_non_ascii(text):
return re.sub(r'[^\x00-\x7f]', r'', text)

def remove_punct(text):
return text.translate(str.maketrans('', '', string.punctuation))

def truncate_text(text, max_length):
# Truncate text to fit within max_length
if len(text) <= max_length:
return text
else:
return text[:max_length-3] + '...'

def predict_sentiment(text, results):
# Clean the text
cleaned_text = remove_punct(remove_non_ascii(encoding(text))).lower()
# Truncate text to fit within the model's maximum sequence length
cleaned_text = truncate_text(cleaned_text, 512)
# Predict sentiment
sentiment = pipe(cleaned_text)
label = sentiment[0]['label']
scores = sentiment[0]['score'] # Fix here, change 'results' to 'sentiment'
return label, scores

def process_row(row, results):
label, scores = predict_sentiment(row['Meeting Notes'], results)
row['sentiment_label'] = label
row['sentiment_scores'] = scores
return row

Read the Excel file into a DataFrame
df = pd.read_excel("FinaldataCRNEW.xlsx")

print(df.shape)
print(df.head())

Drop rows with missing 'Meeting Notes' values
df.dropna(subset=['Meeting Notes'], inplace=True)
df.reset_index(inplace=True, drop=True)

print(df.shape)

Process one row at a time
results = []
for index, row in df.iterrows():
processed_row = process_row(row, results)
results.append(processed_row)
# Clear memory
del processed_row
gc.collect()

Create a new DataFrame from the processed rows
df_processed = pd.DataFrame(results)

print(df_processed)
df_processed.to_csv('Totalnew.csv', index=False)

## Code 2 with functions

In [22]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
import lightgbm as lgb
from sklearn.metrics import accuracy_score as acc

def preprocess_text(text):
    text = re.sub(r"https?://\S+|www.\S+", "", text)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)

def load_test_data(file_path):
    df = pd.read_xlsv(file_path)
    df['Meeting Notes'] = df['Meeting Notes'].astype(str)
    df = df[['Meeting Notes', "Topic"]]
    return df

def apply_model_and_threshold(model, df, threshold):
    x = df["Meeting Notes"].apply(preprocess_text)
    probabilities = model.predict_proba(x)
    le = LE.fit(df["Topic"])
    topic_names = le.inverse_transform(np.arange(len(model.classes_)))
    predicted_classes = [topic_names[i] for i in np.argmax(probabilities, axis=1)]
    filtered_classes = [cls if prob > threshold else 'NAN' for cls, prob in zip(predicted_classes, np.max(probabilities, axis=1))]
    result_df = pd.DataFrame(probabilities, columns=topic_names)
    df["predictedtopic"] = filtered_classes
    df2 = df[df["predictedtopic"] != "NAN"]
    num_records_df2 = len(df2)
    print(type(df2["predictedtopic"]))
    print(type(df2["Topic"]))
    train_acc = acc(le.transform(df2["predictedtopic"]), le.transform(df2['Topic']))
    df = pd.concat([df, result_df], axis=1)
    return df, num_records_df2, train_acc

def save_final_results(df, model, threshold):
    probabilities = model.predict_proba(df["Meeting Notes"].apply(preprocess_text))
    filtered_prob = [prob if prob > threshold else "NAN" for prob in np.max(probabilities, axis=1)]
    df["probability"] = filtered_prob
    df.to_excel('final_results.xlsx', index=False)

# Initialize LabelEncoder
LE = LabelEncoder()

# Load test data
test_data = load_test_data("path")

# Load pre-trained model
loaded_model = joblib.load("model.pkl")

# Set threshold for prediction probabilities
prediction_threshold = 0.75

# Apply model and threshold, calculate results
processed_data, num_records, accuracy = apply_model_and_threshold(loaded_model, test_data, prediction_threshold)
print("Number of records in df2:", num_records)
print("Accuracy:", accuracy)

# Save final results to Excel file
save_final_results(processed_data, loaded_model, prediction_threshold)


<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Number of records in df2: 6542
Accuracy: 0.5715377560379089
                                           Meeting Notes    HIERARCHY_cl  \
0                                                    nan      FRONT LINE   
1      Proactive reporting system is the basic tool e...  Administrative   
2      Implement better channels of communication, en...  Administrative   
3                              You must follow as it now      FRONT LINE   
4                                                    nan      FRONT LINE   
...                                                  ...             ...   
15602  Make a work table with all air carriers of the...      FRONT LINE   
15603                                                nan      FRONT LINE   
15604  To emphasize the step-by-step of reports of se...      FRONT LINE   
15605                                                nan      FRONT LINE   
15606  Maintain processes curren