In [None]:
# Plotting, tabular data, progress bars, scikit-learn and warning control.
import seaborn as sns
import pandas as pd
from tqdm import tqdm
import sklearn
import warnings

# NLTK plus its WordNet / SentiWordNet corpus readers, and regular expressions.
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import re

# Fetch the SentiWordNet corpus data (network access on first run).
nltk.download('sentiwordnet')

# spaCy English pipeline, loaded once at start-up.
import spacy
nlp = spacy.load("en_core_web_sm")

# Flair classes used by the Flair sentiment cell further down.
from flair.models import TextClassifier
from flair.data import Sentence

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas' SettingWithCopyWarning, which several later cells would trigger.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw dataset from a CSV file in the working directory.
df = pd.read_csv("raw_data.csv")

In [None]:
# Keep only rows whose raw text is longer than 20 characters
# (rows with missing text compare as False and are dropped as well).
content_is_long = df['content'].str.len().gt(20)
df = df.loc[content_is_long]

# Distribution of star ratings after the length filter.
sns.countplot(data=df, x='star')

In [None]:
# Require both the preprocessed text and the raw text to exceed 20 characters.
prepro_is_long = df['prepro'].str.len() > 20
content_is_long = df['content'].str.len() > 20
df = df[prepro_is_long & content_is_long]

In [None]:
# Display the filtered frame for a visual check.
df

In [None]:

# One pass over the frame:
#   1. drop rows with duplicated text,
#   2. discard neutral (3-star) rows,
#   3. derive a binary label: 1 for more than 3 stars, otherwise 0,
#   4. remove rows containing NaN values.
df = (
    df.drop_duplicates(subset=["content"])
      .loc[lambda frame: frame['star'] != 3]
      .assign(sentiment=lambda frame: (frame['star'] > 3).astype('int64'))
      .dropna()
)

df

In [None]:
# 80 % of the rows, drawn with a fixed seed, form the training set.
df_train = df.sample(frac=0.8, random_state=82)
# Every row whose index label was not drawn becomes the held-out test set.
df_test = df.loc[~df.index.isin(df_train.index)]

# NOTE(review): the output file is named "text.csv" - possibly meant to be "test.csv"; confirm before renaming.
df_test.to_csv("text.csv", index=False)

print(df_test.shape[0])

In [None]:
# Number of rows in the training split.
print(len(df_train))

In [None]:
# Two-colour palette for the sentiment count plot.
sns.set_palette(["#F44336","#4CAF50"])
sns.countplot(data=df_train, x='sentiment')

# Print the number of positive (1) and negative (0) training rows.
train_labels = df_train["sentiment"]
print("긍정 개수:", train_labels.eq(1).sum())
print("부정 개수:", train_labels.eq(0).sum())

In [None]:
def equal_ratio(df): #downsampling
    """Balance the two sentiment classes by downsampling the larger one.

    Rows with ``sentiment == 1`` and rows with ``sentiment == 0`` are
    separated, the larger group is randomly reduced (fixed seed 82) to the
    size of the smaller group, and the two groups are concatenated, shuffled
    (fixed seed 82) and re-indexed from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sentiment`` column. Rows whose value is neither 0 nor 1
        are not carried over to the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with equally many rows of each class and a fresh
        0..n-1 index. It is empty when either class has no rows.
    """
    pos = df[df['sentiment'] == 1]
    neg = df[df['sentiment'] == 0]

    print(f"Before Positive: {len(pos)}, Before Negative: {len(neg)}")

    # Both classes are cut down to the size of the smaller one. Asking for an
    # explicit row count (rather than a float fraction of the larger class)
    # avoids a division by zero when the input has no rows.
    target = min(len(pos), len(neg))
    if len(pos) > target:
        pos = pos.sample(n=target, random_state=82)
    if len(neg) > target:
        neg = neg.sample(n=target, random_state=82)

    print(f"After Postive: {len(pos)}, After Negative: {len(neg)}")

    df = pd.concat([pos, neg], axis=0)
    if df.empty:
        # Nothing to shuffle; still hand back a frame with a clean index.
        return df.reset_index(drop=True)

    # Seeded shuffle: frac=1 returns every row exactly once in random order.
    # NOTE(review): this replaces sklearn.utils.shuffle(df, random_state=82);
    # it is expected to yield the same ordering - verify if exact row order matters.
    df = df.sample(frac=1, random_state=82).reset_index(drop=True)

    return df

In [None]:
# Downsample the training split so both sentiment classes are equally large,
# then plot the resulting class counts.
df = equal_ratio(df_train)
sns.set_palette(["#F44336","#4CAF50"])
sns.countplot(data=df, x='sentiment')

# Unsupervised sentiment analysis (비지도 감성분석)

In [None]:

from textblob import TextBlob

# One label per row of df['content'], in row order.
sent_textblob = []

for text in df['content']:

    blob = TextBlob(text)

    # Signed polarity score of the text as reported by TextBlob.
    polarity = blob.sentiment.polarity

    # Symmetric cut-offs around zero: scores inside [-0.1, 0.1] are Neutral.
    # The negative branch previously tested `polarity < 0.1`, which labelled
    # mildly positive scores (between 0 and 0.1) as Negative.
    if polarity > 0.1:
        sent_textblob.append('Positive')
    elif polarity < -0.1:
        sent_textblob.append('Negative')
    else:
        sent_textblob.append('Neutral')

df['sentiment_textblob'] = sent_textblob
print('Textblob')

In [None]:
from afinn import Afinn

# One label per row of df['content'], in row order.
sent_afinn = []

afinn = Afinn()


for text in df['content']:

    # Signed AFINN score for the whole text.
    sentiment_score = afinn.score(text)

    # Symmetric cut-offs around zero: scores inside [-2, 2] are Neutral.
    # The negative branch previously tested `sentiment_score < 2`, which
    # labelled weakly positive texts (e.g. a score of 1) as Negative.
    if sentiment_score > 2:
        sent_afinn.append('Positive')
    elif sentiment_score < -2:
        sent_afinn.append('Negative')
    else:
        sent_afinn.append('Neutral')

df['sentiment_afinn'] = sent_afinn
print('Afinn')

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The analyzer needs the VADER lexicon to be available locally.
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()


def _vader_label(compound):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral'."""
    if compound > 0.1:
        return 'Positive'
    # Any value at or below -0.1 is necessarily non-zero, so no extra zero check is needed.
    if compound <= -0.1:
        return 'Negative'
    return 'Neutral'


# One label per row of df['content'], in row order.
sent_vader = [
    _vader_label(analyzer.polarity_scores(text)['compound'])
    for text in df['content']
]

df['sentiment_vader'] = sent_vader
print('VADER')

In [None]:
from pattern.en import sentiment

# One label per row of df['content'], in row order.
sent_pattern = []

for text in df['content']:

    # Pattern returns a (polarity, subjectivity) pair; only the polarity is used.
    # The second value was previously bound to a variable named `confidence`.
    sentiment_score, subjectivity = sentiment(text)

    # Symmetric cut-offs around zero: scores inside [-0.1, 0.1] are Neutral.
    # The negative branch previously tested `sentiment_score < 0.1`, which
    # labelled mildly positive scores (between 0 and 0.1) as Negative.
    if sentiment_score > 0.1:
        sent_pattern.append('Positive')
    elif sentiment_score < -0.1:
        sent_pattern.append('Negative')
    else:
        sent_pattern.append('Neutral')

df['sentiment_pattern'] = sent_pattern
print('Pattern')

In [None]:

# Pre-trained Flair sentiment classifier.
classifier = TextClassifier.load('en-sentiment')

# Flair label value -> label vocabulary shared with the other classifiers.
# Any other value falls back to 'Neutral'.
flair_to_label = {'POSITIVE': 'Positive', 'NEGATIVE': 'Negative'}

sent_flair = []
# Initialised but not filled anywhere in the visible cells.
sentiment_flair = []

for text in df['content']:
    sentence = Sentence(text)
    classifier.predict(sentence)

    # Only the first predicted label is considered.
    top_label = sentence.labels[0]
    sentiment_score = top_label.score
    sentiment_value = top_label.value

    sent_flair.append(flair_to_label.get(sentiment_value, 'Neutral'))

df['sentiment_flair'] = sent_flair

print('Flair')


In [None]:
# df_train now refers to the frame annotated in the preceding cells
# (same object as df, not a copy).
df_train = df
# Row count per star-derived sentiment label.
df_train['sentiment'].value_counts()

In [None]:
# Persist the annotated training frame (without the index column).
df_train.to_csv("unsupx_train.csv", index=False)

In [None]:

# Only the five unsupervised classifier outputs take part in the vote.
# Previously the mode was taken across every column of the frame (star,
# content, at, prepro, sentiment, ...), which is slow, mixes unrelated values
# into the count, and changes behaviour when the cell is re-run after
# 'sentiment_final' has been added.
vote_columns = [
    'sentiment_textblob',
    'sentiment_afinn',
    'sentiment_vader',
    'sentiment_pattern',
    'sentiment_flair',
]
votes = df_train[vote_columns]

# Most frequent label per row, and how many classifiers agreed on it.
mode_values = votes.mode(axis=1)[0]
mode_counts = votes.apply(lambda row: row.value_counts().max(), axis=1)

# With five voters and three possible labels, a top count of 2 means a
# 2-2-1 split, i.e. no majority; those rows are discarded.
has_majority = mode_counts >= 3

# .copy() so the new column is written to an independent frame rather than
# to a filtered slice of the original.
df_train = df_train[has_majority].copy()
# Positional assignment: both sides were filtered with the same mask.
df_train['sentiment_final'] = mode_values[has_majority].to_numpy()

In [None]:
# Row count per majority-vote label.
df_train['sentiment_final'].value_counts()

In [None]:
# Keep the listed source columns plus the two labels; the individual
# classifier columns (sentiment_textblob, sentiment_afinn, ...) are dropped.
df_train = df_train[['star', 'content', 'at', 'prepro', 'sentiment','sentiment_final']]

In [None]:
# Keep only rows whose majority-vote label is Positive or Negative.
is_decisive = df_train['sentiment_final'].isin(['Positive', 'Negative'])
df_with_unsup = df_train[is_decisive].copy()

# Encode the vote like the star-derived label: Positive -> 1, Negative -> 0.
df_with_unsup['sentiment_final'] = (
    df_with_unsup['sentiment_final'] == 'Positive'
).astype('int64')

# Retain only rows where the vote agrees with the star-derived label.
labels_agree = df_with_unsup['sentiment'] == df_with_unsup['sentiment_final']
df_with_unsup = df_with_unsup[labels_agree]

# Sanity check: both label columns should now have identical class counts.
df_with_unsup['sentiment'].value_counts() == df_with_unsup['sentiment_final'].value_counts()

In [None]:
# Row count per class after keeping only rows where both labels agree.
df_with_unsup['sentiment_final'].value_counts()

In [None]:
# Two-colour palette for the sentiment count plot.
sns.set_palette(["#F44336","#4CAF50"])

# Downsample the larger class so both classes have the same number of rows,
# then plot the class counts of the balanced frame.
df_with_unsup = equal_ratio(df_with_unsup)
sns.countplot(x='sentiment', data=df_with_unsup)