### Importing necessary libraries

In [68]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import requests 
from bs4 import BeautifulSoup 
import string  
import pickle
from nltk.corpus import stopwords 
import re 
from nltk.tokenize import RegexpTokenizer 
import nltk 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import roc_curve, auc 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from xgboost import XGBClassifier

### Web Scraping

In [2]:
url = 'https://www.airlinequality.com/airline-reviews/qantas-airways/'
response = requests.get(url)
html_content = response.text

In [3]:
response

<Response [200]>

In [4]:
html_content

'<!doctype html>\n\n<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7 lt-ie10" lang="en-GB"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8 lt-ie10" lang="en-GB"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js lt-ie9 lt-ie10" lang="en-GB"> <![endif]-->\n<!--[if IE 9]>    <html class="no-js lt-ie10" lang="en-GB"> <![endif]-->\n<!--[if gt IE 8]><!-->\n<html lang="en-GB">\n<!--<![endif]-->\n\n<head>\n    <meta charset="utf-8">\n\n    <title>Qantas Airways Customer Reviews - SKYTRAX</title><link rel="preload" as="image" href="https://www.airlinequality.com/wp-content/themes/airlinequality2014new/library/images/nav/grad-header.jpg" fetchpriority="high">\n\n    <!-- Google Chrome Frame for IE -->\n    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n\n    <!-- mobile meta -->\n    <meta name="HandheldFriendly" content="True">\n    <meta name="MobileOptimized" content="320">\n    <meta name="viewport"\n        content="width=device-width, initial-s

#### Scraping Airline Reviews

In [61]:
def parse_star_rating(star_container):
    # Check for star ratings
    stars = star_container.find_all('span', class_='star')
    if stars:
        rating = sum(1 for star in stars if 'fill' in star['class'])
        return rating
    # Handle textual responses
    textual_rating = star_container.text.strip().lower()
    if textual_rating in ['yes', 'no']:
        return textual_rating
    return 'No rating'  # Default case if no stars or known text is found

def fetch_reviews(url):
    # Fetch the page
    response = requests.get(url)
    response.raise_for_status()  # This will raise an error if the fetch fails

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all review articles
    reviews = soup.find_all('article', itemprop='review')
    all_reviews = []
    
    for review in reviews:
        review_details = {
            'title': review.find('h2', class_='text_header').text.strip() if review.find('h2', class_='text_header') else 'No title',
            'author': review.find('span', itemprop='name').text.strip() if review.find('span', itemprop='name') else 'Anonymous',
            'date': review.find('time', itemprop='datePublished')['datetime'] if review.find('time', itemprop='datePublished') else 'No date',
            'body': review.find('div', class_='text_content').text.strip() if review.find('div', class_='text_content') else 'No content',
            'rating': review.find('div', itemprop='reviewRating').find('span', itemprop='ratingValue').text if review.find('div', itemprop='reviewRating') else 'No rating'
        }

        # Parse ratings
        rating_table = review.find('table', class_='review-ratings')
        if rating_table:
            for row in rating_table.find_all('tr'):
                category = row.find('td', class_='review-rating-header').text.strip().replace(' ', '_').lower()
                stars = row.find('td', class_='review-rating-stars')
                if stars:
                    review_details[category] = parse_star_rating(stars)
                else:
                    # Handle rows where no star spans are found, might be textual
                    review_details[category] = row.find('td', class_='review-value').text.strip().lower()

        all_reviews.append(review_details)
    
    return all_reviews, soup

def main():
    base_url = 'https://www.airlinequality.com/airline-reviews/qantas-airways/page/'
    
    all_data = []
    current_page = 1
    
    while True:
        print(f"Scraping page {current_page}")
        url = f"{base_url}{current_page}/"
        reviews, soup = fetch_reviews(url)
        
        if reviews is None or not reviews:
            print("No more pages to scrape or page not found.")
            break
        all_data.extend(reviews)
        current_page += 1

    df = pd.DataFrame(all_data)
    return df

if __name__ == '__main__':
    airline_review_df = main()
    print(airline_review_df.head())
    

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
Scraping page 46
Scraping page 47
Scraping page 48
Scraping page 49
Scraping page 50
Scraping page 51
Scraping page 52
Scraping page 53
Scraping page 54
Scraping page 55
Scraping page 56
Scraping page 57
Scraping page 58
Scraping page 59
Scrapi

#### Scraping Seat Reviews

In [70]:
def parse_star_rating(star_container):
    # Check for star ratings
    stars = star_container.find_all('span', class_='star')
    if stars:
        rating = sum(1 for star in stars if 'fill' in star['class'])
        return rating
    # Handle textual responses
    textual_rating = star_container.text.strip().lower()
    if textual_rating in ['yes', 'no']:
        return textual_rating
    return 'No rating'  # Default case if no stars or known text is found

def fetch_reviews(url):
    # Fetch the page
    response = requests.get(url)
    response.raise_for_status()  # This will raise an error if the fetch fails

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all review articles
    reviews = soup.find_all('article', itemprop='review')
    all_reviews = []
    
    for review in reviews:
        review_details = {
            'title': review.find('h2', class_='text_header').text.strip() if review.find('h2', class_='text_header') else 'No title',
            'author': review.find('span', itemprop='name').text.strip() if review.find('span', itemprop='name') else 'Anonymous',
            'date': review.find('time', itemprop='datePublished')['datetime'] if review.find('time', itemprop='datePublished') else 'No date',
            'body': review.find('div', class_='text_content').text.strip() if review.find('div', class_='text_content') else 'No content',
            'rating': review.find('div', itemprop='reviewRating').find('span', itemprop='ratingValue').text if review.find('div', itemprop='reviewRating') else 'No rating'
        }

        # Parse ratings
        rating_table = review.find('table', class_='review-ratings')
        if rating_table:
            for row in rating_table.find_all('tr'):
                category = row.find('td', class_='review-rating-header').text.strip().replace(' ', '_').lower()
                stars = row.find('td', class_='review-rating-stars')
                if stars:
                    review_details[category] = parse_star_rating(stars)
                else:
                    # Handle rows where no star spans are found, might be textual
                    review_details[category] = row.find('td', class_='review-value').text.strip().lower()

        all_reviews.append(review_details)
    
    return all_reviews, soup

def main():
    base_url = 'https://www.airlinequality.com/seat-reviews/qantas-airways/page/'
    
    all_data = []
    current_page = 1
    
    while True:
        print(f"Scraping page {current_page}")
        url = f"{base_url}{current_page}/"
        reviews, soup = fetch_reviews(url)
        
        if reviews is None or not reviews:
            print("No more pages to scrape or page not found.")
            break
        all_data.extend(reviews)
        current_page += 1

    df = pd.DataFrame(all_data)
    return df

if __name__ == '__main__':
    seat_review_df = main()
    print(seat_review_df.head())
    

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
No more pages to scrape or page not found.
                                    title                author        date  \
0  "You are separated by a fixed divider”        Michael Kinder  2022-09-29   
1               "private and comfortable"               B Heale  2020-01-08   
2               "very comfortable for me"  Muhammad Tahir Hanif  2019-12-08   
3                  "seat was comfortable"         Alan Sargeant  2019-10-21   
4                "cheap out on breakfast"            Davy Adams  2019-10-20   

                                                body rating       seat_type  \
0  ✅ Trip Verified |  In September we took our fi...      2  business class   
1  ✅ Trip Verified |  The A380 has an extremely p...     10     first class   
2  ✅ Trip Verified |  Seat was nice and wide, ver...      9   econo

#### Scraping Lounge Reviews

In [78]:
def parse_star_rating(star_container):
    # Check for star ratings
    stars = star_container.find_all('span', class_='star')
    if stars:
        rating = sum(1 for star in stars if 'fill' in star['class'])
        return rating
    # Handle textual responses
    textual_rating = star_container.text.strip().lower()
    if textual_rating in ['yes', 'no']:
        return textual_rating
    return 'No rating'  # Default case if no stars or known text is found

def fetch_reviews(url):
    # Fetch the page
    response = requests.get(url)
    response.raise_for_status()  # This will raise an error if the fetch fails

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all review articles
    reviews = soup.find_all('article', itemprop='review')
    all_reviews = []
    
    for review in reviews:
        review_details = {
            'title': review.find('h2', class_='text_header').text.strip() if review.find('h2', class_='text_header') else 'No title',
            'author': review.find('span', itemprop='name').text.strip() if review.find('span', itemprop='name') else 'Anonymous',
            'date': review.find('time', itemprop='datePublished')['datetime'] if review.find('time', itemprop='datePublished') else 'No date',
            'body': review.find('div', class_='text_content').text.strip() if review.find('div', class_='text_content') else 'No content',
            'rating': review.find('div', itemprop='reviewRating').find('span', itemprop='ratingValue').text if review.find('div', itemprop='reviewRating') else 'No rating'
        }

        # Parse ratings
        rating_table = review.find('table', class_='review-ratings')
        if rating_table:
            for row in rating_table.find_all('tr'):
                category = row.find('td', class_='review-rating-header').text.strip().replace(' ', '_').lower()
                stars = row.find('td', class_='review-rating-stars')
                if stars:
                    review_details[category] = parse_star_rating(stars)
                else:
                    # Handle rows where no star spans are found, might be textual
                    review_details[category] = row.find('td', class_='review-value').text.strip().lower()

        all_reviews.append(review_details)
    
    return all_reviews, soup

def main():
    base_url = 'https://www.airlinequality.com/lounge-reviews/qantas-airways/page/'
    
    all_data = []
    current_page = 1
    
    while True:
        print(f"Scraping page {current_page}")
        url = f"{base_url}{current_page}/"
        reviews, soup = fetch_reviews(url)
        
        if reviews is None or not reviews:
            print("No more pages to scrape or page not found.")
            break
        all_data.extend(reviews)
        current_page += 1

    df = pd.DataFrame(all_data)
    return df

if __name__ == '__main__':
    lounge_review_df = main()
    print(lounge_review_df.head())
    

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
No more pages to scrape or page not found.
                               title        author        date  \
0              "noticed no GF foods"  Irene Taylor  2024-05-30   
1       "The lounge was pretty grim"  Tony Maddern  2023-12-21   
2  "really let their standards slip"     M Kellett  2023-09-13   
3  "The lounge is absolutely filthy"      O Binder  2023-07-20   
4            "The lounge was filthy"  Ed Blackwell  2023-05-11   

                                                body rating  \
0  Not Verified |   I am gluten free. I went to

### Inspecting the dataframes

In [8]:
# remove at the end
airline_review_df = pd.read_csv('review_df.csv')
seat_review_df = pd.read_csv('seat_review_df.csv')
lounge_review_df = pd.read_csv('lounge_review_df.csv')

In [9]:
# remove at the end
airline_review_df.drop(airline_review_df.columns[0], axis=1, inplace=True)
seat_review_df.drop(seat_review_df.columns[0], axis=1, inplace=True)
lounge_review_df.drop(lounge_review_df.columns[0], axis=1, inplace=True)

In [10]:
pd.set_option('display.max_columns', None)

In [11]:
airline_review_df.head()

Unnamed: 0,title,author,date,body,rating,aircraft,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended
0,"""Significant delays""",C Meade,2024-07-03,✅ Trip Verified | Significant delays with fir...,4,boeing 737-800,business,economy class,adelaide to sydney,june 2024,3.0,4.0,3.0,4.0,2.0,4.0,2.0,yes
1,"""food quality was abysmal""",K Jasmine,2024-06-29,Not Verified | I recently flew with Qantas a...,2,,solo leisure,economy class,melbourne to sydney,april 2024,1.0,1.0,,,1.0,,2.0,no
2,"""wasted money on a business ticket""",T Bayne,2024-06-25,✅ Trip Verified | My husband and I went to Au...,2,boeing 737,couple leisure,business class,auckland to adelaide via melbourne,june 2024,1.0,4.0,3.0,1.0,2.0,1.0,1.0,no
3,"""think about the food needs""",W Neale,2024-06-20,✅ Trip Verified | I travelled with my husband...,4,,business,business class,sydney to dili via darwin,june 2024,2.0,3.0,2.0,2.0,2.0,2.0,2.0,no
4,"""Another horrific flight""",Robin Esdaile,2024-06-07,✅ Trip Verified | Another horrific flight end...,3,a380,business,economy class,sydney to london via singapore,june 2024,3.0,2.0,1.0,3.0,2.0,1.0,2.0,no


In [12]:
seat_review_df.head()

Unnamed: 0,title,author,date,body,rating,seat_type,aircraft_type,seat_layout,date_flown,type_of_traveller,sleep_comfort,sitting_comfort,seat/bed_width,seat/bed_length,seat_privacy,power_supply,seat_storage,recommended,seat_legroom,seat_recline,seat_width,aisle_space,viewing_tv_screen
0,"""You are separated by a fixed divider”",Michael Kinder,2022-09-29,✅ Trip Verified | In September we took our fi...,2,business class,a330-300,1x2x1,september 2022,leisure,3.0,1.0,3.0,5.0,1.0,4.0,4.0,no,,,,,
1,"""private and comfortable""",B Heale,2020-01-08,✅ Trip Verified | The A380 has an extremely p...,10,first class,a380,1x1x1,january 2020,business,5.0,5.0,5.0,5.0,5.0,5.0,5.0,yes,,,,,
2,"""very comfortable for me""",Muhammad Tahir Hanif,2019-12-08,"✅ Trip Verified | Seat was nice and wide, ver...",9,economy class,boeing 787 / a330,3x3x3 / 2x4x2,november 2019,solo leisure,,,,,,5.0,5.0,yes,5.0,4.0,4.0,4.0,5.0
3,"""seat was comfortable""",Alan Sargeant,2019-10-21,✅ Trip Verified | The A330-300 business class...,9,business class,a330-300,1x2x1,august 2019,leisure,5.0,5.0,5.0,5.0,4.0,,4.0,yes,,,,,
4,"""cheap out on breakfast""",Davy Adams,2019-10-20,✅ Trip Verified | Row 6 is sandwiched between...,5,business class,a330-200,1x2x1,october 2019,leisure,3.0,5.0,4.0,5.0,1.0,5.0,5.0,no,,,,,


In [13]:
lounge_review_df.head()

Unnamed: 0,title,author,date,body,rating,airport,type_of_lounge,date_visit,type_of_traveller,comfort,cleanliness,bar_&_beverages,catering,washrooms,wifi_connectivity,staff_service,recommended,lounge_name
0,"""noticed no GF foods""",Irene Taylor,2024-05-30,Not Verified | I am gluten free. I went to g...,5,vancouver airport,business class,may 2024,business,3,5,4.0,1.0,5.0,4.0,3.0,no,
1,"""The lounge was pretty grim""",Tony Maddern,2023-12-21,"✅ Trip Verified | The lounge was pretty grim,...",3,melbourne airport,frequent flyer,december 2023,,3,4,2.0,2.0,3.0,3.0,,yes,
2,"""really let their standards slip""",M Kellett,2023-09-13,Not Verified | Qantas have really let their s...,3,sydney airport,business class,august 2023,business,4,3,3.0,1.0,2.0,5.0,2.0,no,business class lounge
3,"""The lounge is absolutely filthy""",O Binder,2023-07-20,Not Verified | The lounge is absolutely filth...,2,sydney airport,business class,july 2023,business,2,1,,1.0,1.0,4.0,2.0,no,qantas international lounge
4,"""The lounge was filthy""",Ed Blackwell,2023-05-11,"Not Verified | Food was poor, and a lot of th...",3,sydney airport,business class,may 2023,business,2,1,2.0,2.0,1.0,,3.0,no,


### Getting it ready for Sentiment Analysis

In [14]:
selected_columns = ['title', 'body', 'rating']
reviews_df = airline_review_df[selected_columns]
reviews_df2 = seat_review_df[selected_columns]
reviews_df3 = lounge_review_df[selected_columns]

In [15]:
df = pd.concat([reviews_df, reviews_df2, reviews_df3], ignore_index=True)

In [16]:
df = df.rename(columns={'body': 'text'})

In [17]:
df.head()

Unnamed: 0,title,text,rating
0,"""Significant delays""",✅ Trip Verified | Significant delays with fir...,4
1,"""food quality was abysmal""",Not Verified | I recently flew with Qantas a...,2
2,"""wasted money on a business ticket""",✅ Trip Verified | My husband and I went to Au...,2
3,"""think about the food needs""",✅ Trip Verified | I travelled with my husband...,4
4,"""Another horrific flight""",✅ Trip Verified | Another horrific flight end...,3


In [18]:
df['rating'][df['rating']<=7]=0
df['rating'][df['rating']>7]=1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['rating'][df['rating']<=7]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'][df['rating']<=7]=0

In [19]:
df['rating'].unique()

array([0, 1])

In [20]:
data_pos = df[df['rating'] == 1]
data_neg = df[df['rating'] == 0]

In [21]:
len(data_pos)

868

In [22]:
len(data_neg)

1395

#### Making statement text in lower case

In [23]:
df['text']=df['text'].str.lower()

In [24]:
df['text'].tail()

2258    the ambience is active but quiet. from the tel...
2259    this lounge is immaculately maintained - as wa...
2260    the staff were attentive and friendly. as ther...
2261    sydney business lounge very spacious with nice...
2262    the renovated qantas club at perth internation...
Name: text, dtype: object

In [25]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['text'] = df['text'].apply(lambda text: cleaning_stopwords(text))
df['text'].head()

0    ✅ trip verified | significant delays first ann...
1    verified | recently flew qantas thoroughly dis...
2    ✅ trip verified | husband went auckland air ne...
3    ✅ trip verified | travelled husband daughter 1...
4    ✅ trip verified | another horrific flight endu...
Name: text, dtype: object

#### Cleaning and removing punctuations

In [26]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [27]:
df['text']= df['text'].apply(lambda x: cleaning_punctuations(x))
df['text'].tail()

2258    ambience active quiet telephone call day askin...
2259    lounge immaculately maintained  stated previou...
2260    staff attentive friendly airside pharmacy sydn...
2261    sydney business lounge spacious nice views foo...
2262    renovated qantas club perth international vast...
Name: text, dtype: object

#### Cleaning and removing repeating characters

In [28]:
def cleaning_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

In [29]:
df['text'] = df['text'].apply(lambda x: cleaning_repeating_char(x))
df['text'].tail()

2258    ambience active quiet telephone cal day asking...
2259    lounge imaculately maintained stated previousl...
2260    staf atentive friendly airside pharmacy sydney...
2261    sydney busines lounge spacious nice views fod ...
2262    renovated qantas club perth international vast...
Name: text, dtype: object

#### Cleaning and removing numeric numbers

In [30]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

In [31]:
df['text'] = df['text'].apply(lambda x: cleaning_numbers(x))
df['text'].tail()

2258    ambience active quiet telephone cal day asking...
2259    lounge imaculately maintained stated previousl...
2260    staf atentive friendly airside pharmacy sydney...
2261    sydney busines lounge spacious nice views fod ...
2262    renovated qantas club perth international vast...
Name: text, dtype: object

#### Getting tokenization of the text

In [32]:
tokenizer = RegexpTokenizer(r'\w+')
df['text'] = df['text'].apply(tokenizer.tokenize)

In [33]:
df['text'].head()

0    [trip, verified, significant, delays, first, a...
1    [verified, recently, flew, qantas, thoroughly,...
2    [trip, verified, husband, went, auckland, air,...
3    [trip, verified, traveled, husband, daughter, ...
4    [trip, verified, another, horific, flight, end...
Name: text, dtype: object

#### Applying Stemming

In [35]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data

df['text']= df['text'].apply(lambda x: stemming_on_text(x))

In [36]:
df['text'].head()

0    [trip, verified, significant, delays, first, a...
1    [verified, recently, flew, qantas, thoroughly,...
2    [trip, verified, husband, went, auckland, air,...
3    [trip, verified, traveled, husband, daughter, ...
4    [trip, verified, another, horific, flight, end...
Name: text, dtype: object

#### Applying Lemmatizer

In [37]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

df['text'] = df['text'].apply(lambda x: lemmatizer_on_text(x))

In [38]:
df['text'].head()

0    [trip, verified, significant, delays, first, a...
1    [verified, recently, flew, qantas, thoroughly,...
2    [trip, verified, husband, went, auckland, air,...
3    [trip, verified, traveled, husband, daughter, ...
4    [trip, verified, another, horific, flight, end...
Name: text, dtype: object

#### Stripping unnecessary characters

In [55]:
# Remove leading and trailing square brackets
df['text'] = df['text'].str.strip('[]')

### Training a ML model

#### Preparing the input features for training

In [57]:
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [58]:
X = tfidf.fit_transform(df['text']).toarray()

In [59]:
X.shape

(2263, 3000)

In [61]:
y = df['rating'].values

In [63]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

#### Model Training

In [69]:
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [70]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    
    return accuracy,precision

In [72]:
train_classifier(xgb,X_train,y_train,X_test,y_test)

(0.8079470198675497, 0.808641975308642)

In [73]:
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(xgb,open('model.pkl','wb'))