# 감성분석(Sentiment Analysis)
* author : 한국외국어대학교 경영학전공 202102858 임산별
* 한국외국어대학교 빅데이터 2학기 기말 프로젝트



*   데이터 : car_com,edmunds의 ravo4에 대한 2019~2023년까지의 리뷰


*   코드 참고 : https://github.com/hanbit/blueprints-text?tab=readme-ov-file
*   참고 문헌 : 파이썬 라이브러리를 활용한 텍스트 분석(젠스 알브레히트)


*   과정


1.   데이터 불러오기/setting
2.   전처리
3. 훈련/테스트 분할
4. 토큰화 /백터화
5. 예측







In [4]:
!pip install pandas
import pandas as pd



In [6]:
edmunds = pd.read_csv("data/edmunds.csv")

In [8]:
car_com = pd.read_excel("data/reviews_2010_합친버전.xlsx")

## 1. 데이터 Setting

### car_com 데이터 setting

In [12]:
# 날짜 수정
car_com['date'] = pd.to_datetime(car_com['날짜'])

# 리뷰 제목과 리뷰 내용을 합쳐서 review 열 생성
car_com['review'] = car_com['리뷰 제목'] + " " + car_com['리뷰 내용']


#날짜 오름차순으로 정렬
car_com = car_com.sort_values(by='date')

#2009년도 제거
car_com = car_com[car_com['date'].dt.year != 2009]

# 리뷰가 중복으로 들어가 있음 -> 리뷰 내용이 같은 리뷰들 중복제거
car_com.drop_duplicates(subset='review', inplace=True)

#날짜 행 삭제
car_com.drop(columns=['날짜'], inplace=True)

#평점 숫자형으로 바꾸기
car_com['rating'] = car_com['평점'].astype(float)

In [14]:
#리뷰 제목, 리뷰 내용, 평점 삭제
car_com.drop(columns=['리뷰 제목', '리뷰 내용', '평점'], inplace=True)

In [16]:
car_com.head()

Unnamed: 0,date,review,rating
598,2010-01-01,Great Value - Fun to Drive - First Foreign Car...,5.0
147,2010-01-03,Very good value! This is my first suv and I ha...,5.0
146,2010-01-13,Great compact SUV This is my first SUV. I have...,5.0
145,2010-01-28,Great small SUV After much research looking at...,5.0
144,2010-02-25,Still have faith in Toyota I purchased a 2010 ...,5.0


### edmunds 데이터 setting

In [19]:
edmunds['날짜'] = edmunds['날짜'].str.extract(r'(\d{2}/\d{2}/\d{4})')[0]

edmunds['평점'] = edmunds['평점'].str.extract(r'(\d+)')[0]

edmunds['평점'] = edmunds['평점'].astype(float)

In [21]:
#날짜열 date형태로 변경
edmunds['날짜'] = pd.to_datetime(edmunds['날짜'])

In [23]:
#열 이름 변경 : 날짜:data, 리뷰:review, 평점:rating
edmunds.rename(columns={'날짜':'date', '리뷰':'review', '평점':'rating'}, inplace=True)

In [25]:
edmunds.head()

Unnamed: 0,date,review,rating
0,2023-01-20,I bought this car as my first and it never bro...,5.0
1,2022-09-22,"Dependable, comfortable, no major mechanical i...",5.0
2,2021-05-07,For a small SUV it has a lot going for it. Lot...,4.0
3,2019-08-02,"Front left&right ball joints replaced at 150,0...",4.0
4,2020-03-13,This car has a recall that effectively permane...,1.0


## 2. edmunds, car_com 데이터 합치기

In [28]:
#데이터 합치기
df = pd.concat([edmunds, car_com])

In [30]:
#평점이 3점인 리뷰의 갯수
df[df['rating'] == 3].count()

date      242
review    242
rating    242
dtype: int64

In [32]:
#제품 평가를 대리 측정값으로 활용함
#긍정적 리뷰와 부정적 리뷰를 명확하게 구분하기 위해 평점 3 필터링
df = df[df['rating'] != 3]

In [34]:
# 지도학습 기반 감성분석

# 제품 리뷰를 기반으로 0/1 레이블 정보를 가진 새로운 클래스 레이블 할당하기
df['sentiment'] = 0
df.loc[df['rating'] > 3, 'sentiment'] = 1
df.loc[df['rating'] < 3, 'sentiment'] = 0

In [36]:
df.sample(3)

Unnamed: 0,date,review,rating,sentiment
4260,2017-12-13,Repeat Purchase I finally broke down and bough...,5.0,1
1346,2011-03-22,Great Car I had a Rav4 as a rental & decided t...,5.0,1
46,2010-11-12,"Best cargo space in its class, lots of useful ...",5.0,1


In [38]:
df.count()

date         4019
review       4018
rating       4019
sentiment    4019
dtype: int64

### 텍스트 전처리

In [48]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [50]:
# NLTK 불용어 사전 다운로드
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanbyeol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sanbyeol/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [52]:
# 불용어 리스트 준비
stop_words = set(stopwords.words('english'))

In [54]:
def preprocess_text(text):
    """
    텍스트 데이터를 전처리하는 함수.
    """
    # 1. 소문자 변환
    text = text.lower()

    # 2. 특수 문자 및 숫자 제거 (알파벳과 공백만 남김)
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # 3. 토큰화 (단어 단위로 분리)
    words = word_tokenize(text)

    # 4. 불용어 제거
    words = [word for word in words if word not in stop_words]

    # 5. 단어들을 공백으로 결합하여 다시 텍스트로 반환
    return " ".join(words)

In [58]:
# 문자열이 아닌 데이터 제거
df = df[df['review'].apply(lambda x: isinstance(x, str))]

# 데이터프레임에 전처리 적용 (리뷰 열 기준)
df['cleaned_review'] = df['review'].apply(preprocess_text)


0    bought car first never broke years worked flaw...
1    dependable comfortable major mechanical issues...
2    small suv lot going lots space back nice back ...
3              front leftright ball joints replaced mi
4    car recall effectively permanently disables re...
Name: cleaned_review, dtype: object


In [60]:
df.head(2)

Unnamed: 0,date,review,rating,sentiment,cleaned_review
0,2023-01-20,I bought this car as my first and it never bro...,5.0,1,bought car first never broke years worked flaw...
1,2022-09-22,"Dependable, comfortable, no major mechanical i...",5.0,1,dependable comfortable major mechanical issues...


## 2. Train-Test Split

In [63]:
from sklearn.model_selection import train_test_split

In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(df['review'],
                                                    df['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df['sentiment'])

In [67]:
print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])

print ('Distribution of classes in Training Data :')
print ('Positive Sentiment ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Negative Sentiment ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))

print ('Distribution of classes in Testing Data :')
print ('Positive Sentiment ', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Negative Sentiment ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

Size of Training Data  3214
Size of Test Data  804
Distribution of classes in Training Data :
Positive Sentiment  94.18170504044804
Negative Sentiment  5.81829495955196
Distribution of classes in Testing Data :
Positive Sentiment  94.15422885572139
Negative Sentiment  5.845771144278607


## 3. Text Vextorization

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
tfidf = TfidfVectorizer(min_df = 10, ngram_range=(1,1))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

## 4. Trainning the Machine Learning model

In [76]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [78]:
model1 = LinearSVC(random_state=42, tol=1e-5)
model1.fit(X_train_tf, Y_train)



In [82]:
Y_pred = model1.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred))

Accuracy Score -  0.9601990049751243
ROC-AUC Score -  0.6994856516484443


In [84]:
sample_reviews = df.sample(5, random_state=22)
sample_reviews_tf = tfidf.transform(sample_reviews['review'])
sentiment_predictions = model1.predict(sample_reviews_tf)
sentiment_predictions = pd.DataFrame(data = sentiment_predictions,
                                     index=sample_reviews.index,
                                     columns=['sentiment_prediction'])
sample_reviews = pd.concat([sample_reviews, sentiment_predictions], axis=1)
print ('Some sample reviews with their sentiment - ')
sample_reviews[['review','sentiment_prediction']]

Some sample reviews with their sentiment - 


Unnamed: 0,review,sentiment_prediction
5399,Everything I hoped for Perfect car for my need...,1
377,We've driven several Camrys for the past 20 ye...,1
4123,Most reliable car! It drives so smooth. Love i...,1
5107,I like the vehicle. You should not be able to ...,1
471,Very satisfied with this car and plan to keep ...,1


In [86]:
#df 데이터 프레임 전체를 감성분석 진행
df['sentiment_prediction'] = model1.predict(tfidf.transform(df['review']))

In [88]:
#엑셀 파일로 내보내기
df.to_excel("sentiment_1.xlsx")

In [90]:
# 감성분석 결과의 긍/부정 결과 비율 % 구하기
df['sentiment_prediction'].value_counts(normalize=True)

sentiment_prediction
1    0.950971
0    0.049029
Name: proportion, dtype: float64