# Importing data

In [None]:
import os, sys
from pathlib import Path

# Directory the notebook was launched from
ROOT_DIR = os.getcwd()

# 'data' folder that sits next to the launch directory
DATASET_DIR = (Path(ROOT_DIR).parent / 'data').resolve()


def _find_project_root(start):
    """Return the closest directory at or above *start* that has a ``src`` entry.

    Args:
        start: Directory (``str`` or ``Path``) where the upward search begins.

    Returns:
        The resolved ``Path`` of the first directory whose listing contains
        an entry named exactly ``src``.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding
            a ``src`` entry. Without this guard the search would never end,
            because the parent of the root directory is the root itself.
    """
    candidate = Path(start).resolve()
    while 'src' not in os.listdir(candidate):
        if candidate.parent == candidate:
            raise FileNotFoundError(
                f"No directory containing 'src' found at or above {start!r}"
            )
        candidate = candidate.parent
    return candidate


# Locate the project root once; every historical alias points at the same place
current_location = _find_project_root(ROOT_DIR)
PARENT_DIRECTORY = current_location
current = current_location

# Make the project importable (e.g. `from src...`) without duplicating entries
if str(current_location) not in sys.path:
    sys.path.append(str(current_location))

# The data folder lives beside the project root, i.e. in its parent directory
DATA_FOLDER = os.path.join(Path(current).parent, 'data')
data_path = os.path.join(DATA_FOLDER, 'firstprocess.csv')


import pandas as pd

# Read both CSV inputs from DATA_FOLDER using the default comma separator
df_sum = pd.read_csv(Path(DATA_FOLDER, 'model1.csv'))
df = pd.read_csv(Path(DATA_FOLDER, 'firstprocess.csv'))


In [None]:

from src.data import preprocess1 as pr
# Stop-word collection provided by the project's preprocessing module
STOP_WORDS = pr.stdstopwords()


def process(text):
    """Run *text* through the preprocessing helpers exposed by ``pr``.

    The text is passed through ``pr.lowercase``, ``pr.delextra`` and
    ``pr.delspaces`` (in that order), split with ``pr.tokenize(..., 'word')``,
    and the tokens are handed to ``pr.lemma``, whose result is returned.
    """
    lowered = pr.lowercase(text)
    cleaned = pr.delextra(lowered)
    compact = pr.delspaces(cleaned)
    tokens = pr.tokenize(compact, 'word')
    return pr.lemma(tokens)

In [None]:
def _token_stats(tokens, stop_words):
    """Summarise one preprocessed text.

    Args:
        tokens: Sequence of string tokens, as returned by ``process``.
        stop_words: Container supporting ``in`` that holds the stop words.

    Returns:
        A tuple ``(non_stop_count, stop_count, stop_portion, mean_word_len)``:
        the number of non-stop tokens, the number of stop tokens, the share
        of stop tokens rounded to 4 decimals (1.0 for an empty text), and the
        mean character length of the non-stop tokens rounded to 3 decimals
        (0.0 when there are none).
    """
    relevant = [t for t in tokens if t not in stop_words]
    num_stop = len(tokens) - len(relevant)

    # An empty text is treated as "all stop words" to avoid dividing by zero
    if len(tokens) > 0:
        stop_portion = round(num_stop / len(tokens), 4)
    else:
        stop_portion = 1.0

    # Sum the token lengths directly: measuring " ".join(relevant) would also
    # count the separating spaces and inflate the average word length
    if len(relevant) > 0:
        word_len = round(sum(len(t) for t in relevant) / len(relevant), 3)
    else:
        word_len = 0.0

    return len(relevant), num_stop, stop_portion, word_len


# Build the stop-word lookup once so each membership test in the loop is O(1)
# NOTE(review): assumes STOP_WORDS is an iterable of hashable tokens - confirm
_stop_lookup = frozenset(STOP_WORDS)

# One dictionary of statistics per row of 'df', in row order
data = []

for _, row in df.iterrows():
    # Preprocess each text and summarise it
    (target_non_stop, target_num_stop,
     target_stop_portion, target_w_len) = _token_stats(process(row['target']), _stop_lookup)
    (source_non_stop, source_num_stop,
     source_stop_portion, source_w_len) = _token_stats(process(row['source']), _stop_lookup)

    # Key order defines the column order of 'stats_df'
    data.append({"source_non_stop": source_non_stop,
                 "target_non_stop": target_non_stop,
                 "source_stop": source_num_stop,
                 "target_stop": target_num_stop,
                 "source_stop_portion": source_stop_portion,
                 "target_stop_portion": target_stop_portion,
                 "source_word_len": source_w_len,
                 "target_word_len": target_w_len})

# Create a Pandas DataFrame 'stats_df' from the computed statistics
stats_df = pd.DataFrame(data=data)

# Save 'stats_df' (including its index column) as 'sumstats.csv' in the data folder
stats_df.to_csv(os.path.join(DATA_FOLDER, 'sumstats.csv'), index=True)


In [None]:
import matplotlib.pyplot as plt

# Plain column means; no offset is applied so the plotted and printed values
# are the actual means that the title and labels claim to show
mean_summary_tox = df_sum['summary_tox'].mean()
mean_source_tox = df_sum['source_tox'].mean()

# Create a bar plot with one bar per column
plt.bar(['summary_tox', 'source_tox'], [mean_summary_tox, mean_source_tox], color=['Red', 'Blue'])

# Add an axis label and a title
plt.ylabel('Toxicity Level')
plt.title('Mean Values of summary_tox and source_tox')

# Display the mean values above the bars
plt.text('summary_tox', mean_summary_tox, f'{mean_summary_tox:.7f}', ha='center', va='bottom')
plt.text('source_tox', mean_source_tox, f'{mean_source_tox:.7f}', ha='center', va='bottom')

# Show the plot
plt.show()





In [None]:
from sklearn.ensemble import RandomForestRegressor


# Features are the per-row text statistics; targets are the toxicity columns of 'df'
X = stats_df
Y1 = df['source_tox']
Y2 = df['target_tox']

# Fixed seed so the fitted forest, and the feature-importance plot derived
# from it, are reproducible from one run to the next
rf1 = RandomForestRegressor(n_estimators=30, random_state=0)
rf1.fit(X, Y1)


In [None]:
# Fixed seed so the fitted forest, and the feature-importance plot derived
# from it, are reproducible from one run to the next
rf2 = RandomForestRegressor(n_estimators=30, random_state=0)
rf2.fit(X, Y2)

In [None]:
# Importance scores from the source-toxicity forest and the matching column names
feature_names = X.columns
feature_importances = rf1.feature_importances_

# Pair each name with its score and rank the pairs, most important first
feature_importance_data = sorted(
    zip(feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Unzip the ranked pairs back into parallel sequences for plotting
sorted_feature_names, sorted_feature_importances = zip(*feature_importance_data)

# Bar chart of the ranked importances
plt.figure(figsize=(10, 6))
plt.bar(sorted_feature_names, sorted_feature_importances)

# Title and axis labels
plt.title('Feature Importance Diagram on the source toxicity')
plt.xlabel('Feature Names')
plt.ylabel('Feature Importance')

# Vertical tick labels keep the feature names readable
plt.xticks(rotation=90)

# Fit everything inside the figure, then render it
plt.tight_layout()
plt.show()

In [None]:
# Importance scores from the target-toxicity forest and the matching column names
feature_names = X.columns
feature_importances = rf2.feature_importances_

# Pair each name with its score and rank the pairs, most important first
feature_importance_data = sorted(
    zip(feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Unzip the ranked pairs back into parallel sequences for plotting
sorted_feature_names, sorted_feature_importances = zip(*feature_importance_data)

# Bar chart of the ranked importances
plt.figure(figsize=(10, 6))
plt.bar(sorted_feature_names, sorted_feature_importances)

# Title and axis labels
plt.title('Feature Importance Diagram on the target toxicity')
plt.xlabel('Feature Names')
plt.ylabel('Feature Importance')

# Vertical tick labels keep the feature names readable
plt.xticks(rotation=90)

# Fit everything inside the figure, then render it
plt.tight_layout()
plt.show()