In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud

# Step 1: Perform PCA for dimensionality reduction
def perform_pca(embeddings, n_components=2):
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings)
    explained_variance = np.sum(pca.explained_variance_ratio_)
    print(f"PCA explained variance: {explained_variance:.2%}")
    return reduced_embeddings

# Step 2: Perform t-SNE for dimensionality reduction
def perform_tsne(embeddings, n_components=2, perplexity=30, random_state=42):
    tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=random_state)
    reduced_embeddings = tsne.fit_transform(embeddings)
    return reduced_embeddings

# Step 3: Visualize embeddings in 2D
def plot_embeddings_2d(embeddings_2d, labels=None, title="2D Embedding Visualization"):
    plt.figure(figsize=(10, 7))
    if labels is not None:
        plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap='viridis', s=50, alpha=0.7)
        plt.colorbar()
    else:
        plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=50, alpha=0.7)
    plt.title(title)
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()

# Step 4: Perform K-Means Clustering
def perform_kmeans(embeddings, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return labels, kmeans

# Step 5: Compute Cosine Similarity Matrix
def compute_cosine_similarity(embeddings):
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix

# Step 6: Compute Wordcloud
def generate_wordcloud(tfidf_matrix, vectorizer, max_words=100, title="Word Cloud"):
    """
    Generates a word cloud based on TF-IDF scores.
    """
    # Compute the average TF-IDF score for each term
    tfidf_scores = np.mean(tfidf_matrix, axis=0)
    tfidf_scores_dict = {term: tfidf_scores[idx] for term, idx in vectorizer.vocabulary_.items()}
    
    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color="white", max_words=max_words)
    wordcloud.generate_from_frequencies(tfidf_scores_dict)
    
    # Plot the word cloud
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=16)
    plt.show()


  from pandas.core import (


In [3]:
import os
import pandas as pd
import json
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Paths
x_folder_path = r"C:\Users\sabri\OneDrive\桌面\論文\論文code\Air transportation SIC 4512\ML dataset\2013-2023_all (X)"
y_file_path = r"C:\Users\sabri\OneDrive\桌面\論文\論文code\Air transportation SIC 4512\ML dataset\Annual ESG score (Y)\Combined_ESG_Scores.csv"
test_file_path = r"C:\Users\sabri\OneDrive\桌面\論文\論文code\Air transportation SIC 4512\ML dataset\Test data"
esg_file_path = r"C:\Users\sabri\OneDrive\桌面\論文\論文code\Air transportation SIC 4512\ESG wordlist.csv"  # Path to the ESG wordlist


# Load ESG dictionary
esg_dict = pd.read_csv(esg_file_path)
esg_terms = set(esg_dict['terms'].str.lower())  # Convert to lowercase for case-insensitive matching

# Load Y data (Annual ESG scores)
y_data = pd.read_csv(y_file_path)
esg_scores = y_data['ESG'].tolist()  # Assuming the column name is 'ESG'

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    text = text.encode().decode("unicode_escape")  # Decode any Unicode escape sequences
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    
    # Remove stopwords and lemmatize
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word in esg_terms]
    return ' '.join(filtered_tokens)

    
    
# Step 1: Extract, preprocess, and store content from each file in the X folder
documents = []

for filename in os.listdir(x_folder_path):
    if filename.endswith('.json'):
        with open(os.path.join(x_folder_path, filename), 'r') as file:
            data = json.load(file)
            # Extract content (assuming the entire content is relevant)
            content = json.dumps(data)  # Convert JSON to string
            # Preprocess the content
            cleaned_text = preprocess_text(content)
            documents.append(cleaned_text)

# documents

# Step 2: Generate TF-IDF vectors for the documents
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents).toarray()

# Step 3: Train a model to predict scores based on TF-IDF vectors
X = tfidf_matrix
y = esg_scores

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use Linear Regression for prediction
model = LinearRegression()

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # 5-fold cross-validation
cv_scores = -cv_scores  # Convert to positive MSE for interpretation

# Print cross-validation results
print("Cross-Validation Mean Squared Errors for each fold:", cv_scores)
print("Average Cross-Validation MSE:", np.mean(cv_scores))


model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Step 4: Define a function for predicting scores on new text input
def predict_score(new_text):
    new_text = preprocess_text(new_text)
    new_tfidf = vectorizer.transform(new_text).toarray()
    predicted_score = model.predict(new_tfidf)
    return predicted_score[0]





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sabri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cross-Validation Mean Squared Errors for each fold: [312.01108056 166.4707854  436.07177085 245.28907707 255.92014289]
Average Cross-Validation MSE: 283.1525713539246
Mean Squared Error: 78.75377610617883


In [5]:
# Step 5: Process and predict ESG scores for test data
new_documents = []

for filename in os.listdir(test_file_path):
    if filename.endswith('.json'):
        with open(os.path.join(test_file_path, filename), 'r') as file:
            data = json.load(file)
            # Extract content (assuming the entire content is relevant)
            content = json.dumps(data)  # Convert JSON to string
            # Preprocess the content
            new_cleaned_text = preprocess_text(content)
            new_documents.append(new_cleaned_text)

# Transform the new documents using the fitted vectorizer
new_tfidf = vectorizer.transform(new_documents).toarray()

# Predict ESG scores for the new files
predicted_scores = model.predict(new_tfidf)

# Print the predicted ESG scores
for idx, score in enumerate(predicted_scores, start=1):
    print(f"File {idx}: Predicted ESG Score: {score}")

File 1: Predicted ESG Score: 93.31725305558422
File 2: Predicted ESG Score: 70.30944193096272
File 3: Predicted ESG Score: 70.5790152910135
File 4: Predicted ESG Score: 55.260703835662866
File 5: Predicted ESG Score: 58.160265538596036
File 6: Predicted ESG Score: 40.775688433261735
File 7: Predicted ESG Score: 55.64681443679913
File 8: Predicted ESG Score: 86.63824984621135


In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

Feature selection with forward stepwise method (include cross validation)

In [16]:
# Linear Regression
model_lr = LinearRegression()

# Forward stepwise feature selection
sfs_lr = SequentialFeatureSelector(model_lr, n_features_to_select='auto', direction='forward', cv=5)
sfs_lr.fit(X, y)

# Transform the dataset to selected features
X_selected_lr = sfs_lr.transform(X)
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_selected_lr, y, test_size=0.2, random_state=42)

# Train and evaluate the model
model_lr.fit(X_train_lr, y_train_lr)
y_pred_lr = model_lr.predict(X_test_lr)
mse_lr = mean_squared_error(y_test_lr, y_pred_lr)
print("Linear Regression with Forward Stepwise Test MSE:", mse_lr)


Linear Regression with Forward Stepwise Test MSE: 317.67948526010025


In [17]:
# Lasso Regression
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X, y)

# Transform the dataset to selected features
X_selected_lasso = model_lasso.transform(X)
X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = train_test_split(X_selected_lasso, y, test_size=0.2, random_state=42)

# Train and evaluate the model
model_lasso.fit(X_train_lasso, y_train_lasso)
y_pred_lasso = model_lasso.predict(X_test_lasso)
mse_lasso = mean_squared_error(y_test_lasso, y_pred_lasso)
print("Lasso Regression with Forward Stepwise Test MSE:", mse_lasso)


Lasso Regression with Forward Stepwise Test MSE: 39.79698498222867


In [18]:
# Decision Tree
model_tree = DecisionTreeRegressor(max_depth=10, random_state=42)

# Forward stepwise feature selection
sfs_tree = SequentialFeatureSelector(model_tree, n_features_to_select='auto', direction='forward', cv=5)
sfs_tree.fit(X, y)

# Transform the dataset to selected features
X_selected_tree = sfs_tree.transform(X)
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X_selected_tree, y, test_size=0.2, random_state=42)

# Train and evaluate the model
model_tree.fit(X_train_tree, y_train_tree)
y_pred_tree = model_tree.predict(X_test_tree)
mse_tree = mean_squared_error(y_test_tree, y_pred_tree)
print("Decision Tree with Forward Stepwise Test MSE:", mse_tree)


Decision Tree with Forward Stepwise Test MSE: 101.61288628555778


Feature selection with decision tree and fit it in XGboost

In [25]:
# Get indices of selected features
selected_feature_indices = sfs_tree.get_support(indices=True)
print(f"Selected Feature Indices: {selected_feature_indices}")

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Step 2: Train XGBoost on the selected features
model_xgb = XGBRegressor(n_estimators=50, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1)

# Train the XGBoost model
model_xgb.fit(X_train_tree, y_train)

# Predict and evaluate
y_pred_xgb = model_xgb.predict(X_test_tree)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost Test MSE with Selected Features: {mse_xgb}")


Selected Feature Indices: [  0   2   3   7   8  11  16  23  26  28  29  33  34  36  37  40  41  43
  45  48  50  52  53  61  62  65  68  69  71  72  73  78  80  85  88  90
  92  93  94  96  97  98  99 102 105 107 109 113 115 118 119 120 121 123
 124 126 129 131 133 134 135 136 137 138 142 143 145 146 148 151 152 153
 155 157 159 161 163 168 169 170 175 176 177 178 180 181 184 185 186 188
 189 190 191 192 194 195 196 197 198 200 201 202 203 204 207 208 209 211
 212 213 214 216 219 220 222 224 225 227 234 235 236 238 239 240 243 244
 245 246 252 257 258 265 267 270 272 273 275 277 278 279 281 284 285 286
 287 292 294 295 297 300 303 304 307 311 312 314 316 317 318 319]
XGBoost Test MSE with Selected Features: 46.0106124622111


Full XGboost Version

In [26]:
# XGBoost
model_xgb = XGBRegressor(n_estimators=10, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1)

# Forward stepwise feature selection
sfs_xgb = SequentialFeatureSelector(model_xgb, n_features_to_select='auto', direction='forward', cv=3)
sfs_xgb.fit(X, y)

# Transform the dataset to selected features
X_selected_xgb = sfs_xgb.transform(X)
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(X_selected_xgb, y, test_size=0.2, random_state=42)

# Train and evaluate the model
model_xgb.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = model_xgb.predict(X_test_xgb)
mse_xgb = mean_squared_error(y_test_xgb, y_pred_xgb)
print("XGBoost with Forward Stepwise Test MSE:", mse_xgb)


KeyboardInterrupt: 

Combine version of 2024 ESG score prediction

In [27]:
# Process test data
new_documents = []
for filename in os.listdir(test_file_path):
    if filename.endswith('.json'):
        with open(os.path.join(test_file_path, filename), 'r') as file:
            data = json.load(file)
            content = json.dumps(data)  # Convert JSON to string
            new_cleaned_text = preprocess_text(content)
            new_documents.append(new_cleaned_text)

# Transform the new documents into TF-IDF features
new_tfidf = vectorizer.transform(new_documents).toarray()

# Predict ESG scores using all models
predicted_scores_lasso = model_lasso.predict(new_tfidf)
predicted_scores_tree = model_tree.predict(new_tfidf)
predicted_scores_xgb = model_xgb.predict(new_tfidf)

# Print predictions for each file
for idx, (lasso_score, tree_score, xgb_score) in enumerate(zip(predicted_scores_lasso, predicted_scores_tree, predicted_scores_xgb), start=1):
    print(f"File {idx}: Lasso ESG Score: {lasso_score}, Decision Tree ESG Score: {tree_score}, XGBoost ESG Score: {xgb_score}")


ValueError: X has 320 features, but Lasso is expecting 160 features as input.

1. Linear Regression

In [None]:
# Linear Regression
model_lr = LinearRegression()

# Cross-validation
cv_scores_lr = cross_val_score(model_lr, X, y, cv=5, scoring='neg_mean_squared_error')
cv_scores_lr = -cv_scores_lr  # Convert to positive MSE

print("Linear Regression Cross-Validation MSE (per fold):", cv_scores_lr)
print("Linear Regression Average CV MSE:", np.mean(cv_scores_lr))

# Train and evaluate on test set
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print("Linear Regression Test Set MSE:", mse_lr)


Linear Regression Cross-Validation MSE (per fold): [312.01108056 166.4707854  436.07177085 245.28907707 255.92014289]
Linear Regression Average CV MSE: 283.1525713539246
Linear Regression Test Set MSE: 78.75377610617883


2. Lasso

In [9]:
from sklearn.linear_model import Lasso

# Lasso Regression
model_lasso = Lasso(alpha=0.01)  # Adjust alpha for regularization strength

# Cross-validation
cv_scores_lasso = cross_val_score(model_lasso, X, y, cv=5, scoring='neg_mean_squared_error')
cv_scores_lasso = -cv_scores_lasso  # Convert to positive MSE

print("Lasso Regression Cross-Validation MSE (per fold):", cv_scores_lasso)
print("Lasso Regression Average CV MSE:", np.mean(cv_scores_lasso))

# Train and evaluate on test set
model_lasso.fit(X_train, y_train)
y_pred_lasso = model_lasso.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("Lasso Regression Test Set MSE:", mse_lasso)


Lasso Regression Cross-Validation MSE (per fold): [ 89.39994493 171.13115684 318.78326313 465.52706559 220.06179173]
Lasso Regression Average CV MSE: 252.9806444430663
Lasso Regression Test Set MSE: 39.57109092053446


3. Decision Tree

In [None]:
# Decision Tree Regressor
model_tree = DecisionTreeRegressor(max_depth=10, random_state=42)  # Adjust max_depth as needed

# Cross-validation
cv_scores_tree = cross_val_score(model_tree, X, y, cv=5, scoring='neg_mean_squared_error')
cv_scores_tree = -cv_scores_tree  # Convert to positive MSE

print("Decision Tree Cross-Validation MSE (per fold):", cv_scores_tree)
print("Decision Tree Average CV MSE:", np.mean(cv_scores_tree))

# Train and evaluate on test set
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)
print("Decision Tree Test Set MSE:", mse_tree)


Decision Tree Cross-Validation MSE (per fold): [678.54044993 259.97185693 442.68954606 918.5174513  329.10486446]
Decision Tree Average CV MSE: 525.7648337353579
Decision Tree Test Set MSE: 211.3539510200011


4. XGboost

In [None]:
# XGBoost Regressor
model_xgb = XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42)

# Cross-validation
cv_scores_xgb = cross_val_score(model_xgb, X, y, cv=5, scoring='neg_mean_squared_error')
cv_scores_xgb = -cv_scores_xgb  # Convert to positive MSE

print("XGBoost Cross-Validation MSE (per fold):", cv_scores_xgb)
print("XGBoost Average CV MSE:", np.mean(cv_scores_xgb))

# Train and evaluate on test set
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("XGBoost Test Set MSE:", mse_xgb)


XGBoost Cross-Validation MSE (per fold): [388.68854311 153.13135534 429.64131123 435.88815059 220.24391327]
XGBoost Average CV MSE: 325.5186547075813
XGBoost Test Set MSE: 101.8876814764665


In [12]:
# Initialize an empty list to store processed test documents
new_documents = []

# Preprocess and collect content from each JSON file in the test folder
for filename in os.listdir(test_file_path):
    if filename.endswith('.json'):
        with open(os.path.join(test_file_path, filename), 'r') as file:
            data = json.load(file)
            # Extract content (assuming the entire content is relevant)
            content = json.dumps(data)  # Convert JSON to string
            # Preprocess the content
            new_cleaned_text = preprocess_text(content)
            new_documents.append(new_cleaned_text)

# Transform the new documents using the fitted vectorizer
new_tfidf = vectorizer.transform(new_documents).toarray()

# Use the trained Lasso model to predict ESG scores for the test files
predicted_scores_lasso = model_lasso.predict(new_tfidf)

# Print the predicted ESG scores for each test file
for idx, score in enumerate(predicted_scores_lasso, start=1):
    print(f"File {idx}: Predicted ESG Score (Lasso): {score}")


File 1: Predicted ESG Score (Lasso): 88.68133787127225
File 2: Predicted ESG Score (Lasso): 72.00600568683566
File 3: Predicted ESG Score (Lasso): 67.22411172526519
File 4: Predicted ESG Score (Lasso): 52.9329662211223
File 5: Predicted ESG Score (Lasso): 59.567840079223835
File 6: Predicted ESG Score (Lasso): 48.24706138005496
File 7: Predicted ESG Score (Lasso): 51.57158517704359
File 8: Predicted ESG Score (Lasso): 75.51048016775673
