In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TfidfVectorizer to convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=9000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.78
Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.71      0.97      0.82        37
cautiously optimistic       0.83      0.59      0.69        17
             negative       1.00      0.69      0.81        16
              neutral       1.00      0.44      0.62         9
             positive       0.77      0.81      0.79        21

             accuracy                           0.78       100
            macro avg       0.86      0.70      0.75       100
         weighted avg       0.82      0.78      0.77       100



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TfidfVectorizer to convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=9000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')  # You can try different kernels, e.g., 'linear', 'rbf', etc.
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate the SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

# Display SVM classification report
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.87
SVM Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.77      1.00      0.87        37
cautiously optimistic       0.88      0.88      0.88        17
             negative       1.00      0.69      0.81        16
              neutral       1.00      0.78      0.88         9
             positive       1.00      0.81      0.89        21

             accuracy                           0.87       100
            macro avg       0.93      0.83      0.87       100
         weighted avg       0.90      0.87      0.87       100



In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TfidfVectorizer to convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=9000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train different classifiers
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear')
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Multinomial Naive Bayes Accuracy: 0.78
Multinomial Naive Bayes Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.71      0.97      0.82        37
cautiously optimistic       0.83      0.59      0.69        17
             negative       1.00      0.69      0.81        16
              neutral       1.00      0.44      0.62         9
             positive       0.77      0.81      0.79        21

             accuracy                           0.78       100
            macro avg       0.86      0.70      0.75       100
         weighted avg       0.82      0.78      0.77       100

Logistic Regression Accuracy: 0.81
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.70      1.00      0.82        37
cautiously optimistic       0.86      0.71      0.77        17
             negative       1.00      0.56      0.72        16
           

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TfidfVectorizer to convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=9000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train different classifiers
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Multinomial Naive Bayes Accuracy: 0.78
Multinomial Naive Bayes Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.71      0.97      0.82        37
cautiously optimistic       0.83      0.59      0.69        17
             negative       1.00      0.69      0.81        16
              neutral       1.00      0.44      0.62         9
             positive       0.77      0.81      0.79        21

             accuracy                           0.78       100
            macro avg       0.86      0.70      0.75       100
         weighted avg       0.82      0.78      0.77       100

Logistic Regression Accuracy: 0.81
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.70      1.00      0.82        37
cautiously optimistic       0.86      0.71      0.77        17
             negative       1.00      0.56      0.72        16
           

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

# Use TfidfVectorizer to convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=9000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train different classifiers
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Multinomial Naive Bayes Accuracy: 0.85
Multinomial Naive Bayes Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.72      1.00      0.84        26
cautiously optimistic       0.94      0.84      0.89        19
             negative       0.95      0.90      0.92        20
              neutral       1.00      0.50      0.67        12
             positive       0.86      0.83      0.84        23

             accuracy                           0.85       100
            macro avg       0.89      0.81      0.83       100
         weighted avg       0.87      0.85      0.85       100

Logistic Regression Accuracy: 0.89
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.74      1.00      0.85        26
cautiously optimistic       0.94      0.84      0.89        19
             negative       0.95      0.90      0.92        20
           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Extra Trees Accuracy: 0.89
Extra Trees Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.77      0.92      0.84        26
cautiously optimistic       0.94      0.84      0.89        19
             negative       0.95      0.95      0.95        20
              neutral       0.83      0.83      0.83        12
             positive       1.00      0.87      0.93        23

             accuracy                           0.89       100
            macro avg       0.90      0.88      0.89       100
         weighted avg       0.90      0.89      0.89       100



In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

# Use CountVectorizer to convert text data into numerical features
vectorizer = CountVectorizer(max_features=9000)  # You can adjust max_features as needed
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# Initialize and train different classifiers
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_count, y_train)
    y_pred = clf.predict(X_test_count)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Multinomial Naive Bayes Accuracy: 0.88
Multinomial Naive Bayes Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.80      0.92      0.86        26
cautiously optimistic       0.94      0.84      0.89        19
             negative       0.83      0.95      0.88        20
              neutral       1.00      0.83      0.91        12
             positive       0.95      0.83      0.88        23

             accuracy                           0.88       100
            macro avg       0.90      0.87      0.88       100
         weighted avg       0.89      0.88      0.88       100

Logistic Regression Accuracy: 0.89
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.78      0.96      0.86        26
cautiously optimistic       0.94      0.84      0.89        19
             negative       0.90      0.90      0.90        20
           

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenized_X_train = [word_tokenize(text) for text in X_train]
tokenized_X_test = [word_tokenize(text) for text in X_test]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_X_train, vector_size=100, window=5, min_count=1, workers=4)

# Function to create document vectors using Word2Vec model
def create_doc_vectors(tokenized_text, model):
    vectors = [model.wv[word] for word in tokenized_text if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

# Create document vectors for training and testing sets
X_train_w2v = [create_doc_vectors(tokens, word2vec_model) for tokens in tokenized_X_train]
X_test_w2v = [create_doc_vectors(tokens, word2vec_model) for tokens in tokenized_X_test]

# Convert to numpy arrays
X_train_w2v = pd.DataFrame(X_train_w2v).to_numpy()
X_test_w2v = pd.DataFrame(X_test_w2v).to_numpy()

# Initialize and train different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_w2v, y_train)
    y_pred = clf.predict(X_test_w2v)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Logistic Regression Accuracy: 0.33
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.37      0.84      0.52        37
cautiously optimistic       0.00      0.00      0.00        17
             negative       0.00      0.00      0.00        16
              neutral       0.00      0.00      0.00         9
             positive       0.12      0.10      0.11        21

             accuracy                           0.33       100
            macro avg       0.10      0.19      0.12       100
         weighted avg       0.16      0.33      0.21       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Accuracy: 0.67
Random Forest Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.67      0.81      0.73        37
cautiously optimistic       0.82      0.53      0.64        17
             negative       0.73      0.50      0.59        16
              neutral       0.40      0.67      0.50         9
             positive       0.78      0.67      0.72        21

             accuracy                           0.67       100
            macro avg       0.68      0.63      0.64       100
         weighted avg       0.70      0.67      0.67       100

Gradient Boosting Accuracy: 0.66
Gradient Boosting Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.74      0.76      0.75        37
cautiously optimistic       0.55      0.65      0.59        17
             negative       0.64      0.44      0.52        16
              neutral       0.45   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoost Accuracy: 0.38
AdaBoost Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.45      0.54      0.49        37
cautiously optimistic       0.26      0.35      0.30        17
             negative       0.36      0.31      0.33        16
              neutral       0.20      0.22      0.21         9
             positive       0.56      0.24      0.33        21

             accuracy                           0.38       100
            macro avg       0.37      0.33      0.33       100
         weighted avg       0.40      0.38      0.38       100

Bagging Classifier Accuracy: 0.63
Bagging Classifier Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.63      0.73      0.68        37
cautiously optimistic       0.56      0.59      0.57        17
             negative       0.73      0.50      0.59        16
              neutral       0.50      0.56 

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
import tensorflow as tf
import tensorflow_hub as hub

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Create embeddings for training and testing sets
X_train_use = embed(X_train)
X_test_use = embed(X_test)

# Convert TensorFlow tensors to NumPy arrays
X_train_use = X_train_use.numpy()
X_test_use = X_test_use.numpy()

# Initialize and train different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_use, y_train)
    y_pred = clf.predict(X_test_use)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Logistic Regression Accuracy: 0.79
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.71      0.86      0.78        37
cautiously optimistic       0.71      0.71      0.71        17
             negative       0.91      0.62      0.74        16
              neutral       1.00      0.78      0.88         9
             positive       0.90      0.86      0.88        21

             accuracy                           0.79       100
            macro avg       0.85      0.77      0.80       100
         weighted avg       0.81      0.79      0.79       100

Random Forest Accuracy: 0.77
Random Forest Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.72      0.89      0.80        37
cautiously optimistic       0.73      0.65      0.69        17
             negative       0.83      0.62      0.71        16
              neutral       1.0

In [19]:
!pip install transformers


[0m

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from transformers import BertTokenizer, BertModel
import torch

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and convert text to BERT embeddings for training set
X_train_bert = [tokenizer(text, return_tensors='pt', truncation=True, padding=True) for text in X_train]
X_train_bert_embeddings = torch.stack([model(**tokens)['last_hidden_state'].mean(dim=1).squeeze() for tokens in X_train_bert])

# Tokenize and convert text to BERT embeddings for testing set
X_test_bert = [tokenizer(text, return_tensors='pt', truncation=True, padding=True) for text in X_test]
X_test_bert_embeddings = torch.stack([model(**tokens)['last_hidden_state'].mean(dim=1).squeeze() for tokens in X_test_bert])

# Convert to NumPy arrays
X_train_bert_embeddings = X_train_bert_embeddings.detach().numpy()
X_test_bert_embeddings = X_test_bert_embeddings.detach().numpy()

# Initialize and train different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_bert_embeddings, y_train)
    y_pred = clf.predict(X_test_bert_embeddings)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Plea

Logistic Regression Accuracy: 0.85
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.81      0.92      0.86        37
cautiously optimistic       0.82      0.82      0.82        17
             negative       0.92      0.75      0.83        16
              neutral       0.80      0.89      0.84         9
             positive       0.94      0.81      0.87        21

             accuracy                           0.85       100
            macro avg       0.86      0.84      0.85       100
         weighted avg       0.86      0.85      0.85       100

Random Forest Accuracy: 0.82
Random Forest Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.73      0.95      0.82        37
cautiously optimistic       0.76      0.76      0.76        17
             negative       1.00      0.62      0.77        16
              neutral       1.0

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from transformers import BertTokenizer, BertModel
import torch

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and convert text to BERT embeddings for training set
X_train_bert = [tokenizer(text, return_tensors='pt', truncation=True, padding=True) for text in X_train]
X_train_bert_embeddings = torch.stack([model(**tokens)['last_hidden_state'].mean(dim=1).squeeze() for tokens in X_train_bert])

# Tokenize and convert text to BERT embeddings for testing set
X_test_bert = [tokenizer(text, return_tensors='pt', truncation=True, padding=True) for text in X_test]
X_test_bert_embeddings = torch.stack([model(**tokens)['last_hidden_state'].mean(dim=1).squeeze() for tokens in X_test_bert])

# Convert to NumPy arrays
X_train_bert_embeddings = X_train_bert_embeddings.detach().numpy()
X_test_bert_embeddings = X_test_bert_embeddings.detach().numpy()

# Initialize and train different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_bert_embeddings, y_train)
    y_pred = clf.predict(X_test_bert_embeddings)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Plea

Logistic Regression Accuracy: 0.85
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.81      0.92      0.86        37
cautiously optimistic       0.82      0.82      0.82        17
             negative       0.92      0.75      0.83        16
              neutral       0.80      0.89      0.84         9
             positive       0.94      0.81      0.87        21

             accuracy                           0.85       100
            macro avg       0.86      0.84      0.85       100
         weighted avg       0.86      0.85      0.85       100

Random Forest Accuracy: 0.80
Random Forest Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.69      0.92      0.79        37
cautiously optimistic       0.81      0.76      0.79        17
             negative       1.00      0.56      0.72        16
              neutral       1.0

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got ['ambivalent' 'cautiously optimistic' 'negative' 'neutral' 'positive']

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenized_X_train = [word_tokenize(text.lower()) for text in X_train]
tokenized_X_test = [word_tokenize(text.lower()) for text in X_test]

# Create TaggedDocuments for training
tagged_data_train = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(tokenized_X_train)]

# Create Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_data_train)
doc2vec_model.train(tagged_data_train, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Transform documents to vectors for training set
X_train_doc2vec = [doc2vec_model.infer_vector(tokens) for tokens in tokenized_X_train]

# Transform documents to vectors for testing set
X_test_doc2vec = [doc2vec_model.infer_vector(tokens) for tokens in tokenized_X_test]

# Convert to NumPy arrays
X_train_doc2vec = pd.DataFrame(X_train_doc2vec).to_numpy()
X_test_doc2vec = pd.DataFrame(X_test_doc2vec).to_numpy()

# Initialize and train different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_doc2vec, y_train)
    y_pred = clf.predict(X_test_doc2vec)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Logistic Regression Accuracy: 0.31
Logistic Regression Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.35      0.62      0.45        37
cautiously optimistic       0.00      0.00      0.00        17
             negative       0.00      0.00      0.00        16
              neutral       0.00      0.00      0.00         9
             positive       0.23      0.38      0.29        21

             accuracy                           0.31       100
            macro avg       0.12      0.20      0.15       100
         weighted avg       0.18      0.31      0.23       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Accuracy: 0.56
Random Forest Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.48      0.73      0.58        37
cautiously optimistic       0.43      0.35      0.39        17
             negative       0.83      0.31      0.45        16
              neutral       0.20      0.11      0.14         9
             positive       0.89      0.81      0.85        21

             accuracy                           0.56       100
            macro avg       0.57      0.46      0.48       100
         weighted avg       0.59      0.56      0.54       100

Gradient Boosting Accuracy: 0.56
Gradient Boosting Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.55      0.65      0.59        37
cautiously optimistic       0.54      0.41      0.47        17
             negative       0.47      0.50      0.48        16
              neutral       0.38   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoost Accuracy: 0.45
AdaBoost Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.49      0.59      0.54        37
cautiously optimistic       0.30      0.35      0.32        17
             negative       0.56      0.31      0.40        16
              neutral       0.23      0.33      0.27         9
             positive       0.69      0.43      0.53        21

             accuracy                           0.45       100
            macro avg       0.45      0.40      0.41       100
         weighted avg       0.49      0.45      0.45       100

Bagging Classifier Accuracy: 0.48
Bagging Classifier Classification Report:
                        precision    recall  f1-score   support

           ambivalent       0.52      0.59      0.56        37
cautiously optimistic       0.42      0.59      0.49        17
             negative       0.42      0.31      0.36        16
              neutral       0.12      0.11 

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Load the dataset with proper tab-separated values
df = pd.read_csv('/kaggle/input/conflict2/conflict.tsv', delimiter='\t')

# Assuming you have a 'text' column for input and a 'label' column for output
# Replace 'text' and 'label' with your actual column names
X = df['text']
y = df['sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenized_X_train = [word_tokenize(text.lower()) for text in X_train]
tokenized_X_test = [word_tokenize(text.lower()) for text in X_test]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_X_train)

# Convert the tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_X_train]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)

# Transform documents to topic distributions for training set
X_train_lda = [lda_model[dictionary.doc2bow(tokens)] for tokens in tokenized_X_train]
X_train_lda = [[topic[1] for topic in doc] for doc in X_train_lda]

# Transform documents to topic distributions for testing set
X_test_lda = [lda_model[dictionary.doc2bow(tokens)] for tokens in tokenized_X_test]
X_test_lda = [[topic[1] for topic in doc] for doc in X_test_lda]

# Convert to NumPy arrays
X_train_lda = pd.DataFrame(X_train_lda).to_numpy()
X_test_lda = pd.DataFrame(X_test_lda).to_numpy()

# Initialize and train different classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_lda, y_train)
    y_pred = clf.predict(X_test_lda)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy:.2f}")

    # Display classification report
    print(f"{clf_name} Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values