In [8]:
import os

import pandas as pd
from google.cloud import language_v1
from google.oauth2 import service_account

# Input CSV, resolved relative to the notebook's working directory.
file_path = r'amazon_reviews_cleaned.csv'

# Service-account key file. Prefer the standard GOOGLE_APPLICATION_CREDENTIALS
# environment variable so the notebook is not tied to one machine's directory
# layout; fall back to the original local key file when it is unset or empty.
key_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or r"C:\ML\neural-cirrus-437807-t9-29c137016ba4.json"
)

# Load the dataset
df = pd.read_csv(file_path)

# Normalise column labels: trim surrounding whitespace, lower-case, and turn
# spaces into underscores (e.g. 'Unnamed: 0' -> 'unnamed:_0').
# regex=False makes the literal-space replacement explicit.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)

# Fail early, before any API client is created, if the review text column is missing.
if 'content' not in df.columns:
    raise KeyError("'content' column not found in the dataset")

# Load credentials from the JSON key file
credentials = service_account.Credentials.from_service_account_file(key_path)

# Initialize the Google NLP client with the credentials
client = language_v1.LanguageServiceClient(credentials=credentials)

# Function to analyze sentiment, entities, and other NLP features for a review
def analyze_review(text):
    """Run the Google Cloud Natural Language analyses on a single review.

    Uses the module-level ``client`` to request document/sentence sentiment,
    entity sentiment, syntax tokens and (best effort) content classification.

    Parameters
    ----------
    text : str or None
        Review text. Falsy values, NaN and whitespace-only strings are treated
        as "no review": nothing is sent to the API and an empty result is
        returned.

    Returns
    -------
    dict
        ``sentiment_score`` / ``sentiment_magnitude``: document-level sentiment
        (``None`` for an empty review).
        ``entities``: list of entity names.
        ``entity_sentiments``: list of dicts with ``name``, ``type``,
        ``salience``, ``sentiment_score`` and ``sentiment_magnitude``.
        ``review_class``: ``'Positive'`` when the document score is >= 0,
        otherwise ``'Negative'`` (``None`` for an empty review).
        ``positive_reasons`` / ``negative_reasons``: space-joined text of the
        sentences whose score is >= 0 / < 0.
        ``syntax_tokens``: list of dicts with ``text`` and ``part_of_speech``.
        ``detected_language``: language reported by the API.
        ``classifications``: list of dicts with ``category`` and
        ``confidence``; empty when classification fails.

    Raises
    ------
    Exception
        Whatever the sentiment, entity-sentiment or syntax calls raise is
        propagated; only classification failures are swallowed.
    """
    # A whitespace-only string is truthy, so it needs its own check; without it
    # the blank text would be sent to the API.
    is_blank = (
        not text
        or pd.isna(text)
        or (isinstance(text, str) and not text.strip())
    )
    if is_blank:
        return {
            'sentiment_score': None,
            'sentiment_magnitude': None,
            'entities': [],
            'entity_sentiments': [],
            'review_class': None,
            'positive_reasons': "",
            'negative_reasons': "",
            'syntax_tokens': [],
            'detected_language': None,
            'classifications': []
        }

    # Prepare document
    document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)

    # Document-level and sentence-level sentiment come from the same response.
    sentiment_response = client.analyze_sentiment(request={'document': document})
    sentiment = sentiment_response.document_sentiment

    # Split sentences by the sign of their score; a score of exactly 0 counts
    # as positive, mirroring the document-level rule below.
    positive_sentences = []
    negative_sentences = []
    for sentence in sentiment_response.sentences:
        bucket = positive_sentences if sentence.sentiment.score >= 0 else negative_sentences
        bucket.append(sentence.text.content)

    # Entities together with the sentiment expressed towards each of them.
    entity_sentiment_response = client.analyze_entity_sentiment(request={'document': document})
    entity_sentiments = [
        {
            'name': entity.name,
            'type': language_v1.Entity.Type(entity.type_).name,
            'salience': entity.salience,
            'sentiment_score': entity.sentiment.score,
            'sentiment_magnitude': entity.sentiment.magnitude
        }
        for entity in entity_sentiment_response.entities
    ]

    # Part-of-speech tag for every token.
    syntax_response = client.analyze_syntax(request={'document': document})
    syntax_tokens = [
        {
            'text': token.text.content,
            'part_of_speech': language_v1.PartOfSpeech.Tag(token.part_of_speech.tag).name
        }
        for token in syntax_response.tokens
    ]

    # The sentiment response already reports the document language, so no
    # separate analyze_entities round-trip is needed just to read it.
    detected_language = sentiment_response.language

    # Classification is best effort: any failure yields an empty list instead
    # of aborting the whole review.
    # NOTE(review): presumably this guards against inputs the API refuses to
    # classify (e.g. very short reviews) -- confirm, and consider narrowing the
    # exception type.
    try:
        classify_response = client.classify_text(request={'document': document})
        classifications = [
            {
                'category': category.name,
                'confidence': category.confidence
            }
            for category in classify_response.categories
        ]
    except Exception:
        classifications = []

    # Classify review as positive or negative based on the overall sentiment score
    review_class = 'Positive' if sentiment.score >= 0 else 'Negative'

    return {
        'sentiment_score': sentiment.score,
        'sentiment_magnitude': sentiment.magnitude,
        'entities': [entity['name'] for entity in entity_sentiments],
        'entity_sentiments': entity_sentiments,
        'review_class': review_class,
        'positive_reasons': " ".join(positive_sentences),
        'negative_reasons': " ".join(negative_sentences),
        'syntax_tokens': syntax_tokens,
        'detected_language': detected_language,
        'classifications': classifications
    }

# Result columns, in the order they are appended to `df`. These are the keys of
# the dict returned by analyze_review.
nlp_columns = [
    'sentiment_score',
    'sentiment_magnitude',
    'entities',
    'entity_sentiments',
    'review_class',
    'positive_reasons',
    'negative_reasons',
    'syntax_tokens',
    'detected_language',
    'classifications',
]

# Analyze each review once and collect the result dicts. Building one frame
# from the collected records lets pandas infer each column's dtype, instead of
# pre-filling every column with None and writing cell by cell with df.at.
# NOTE: an exception raised by analyze_review still aborts the run before
# anything is saved, exactly as with the original row-by-row loop.
analysis_results = [analyze_review(review_text) for review_text in df['content']]

# `columns=` keeps the expected columns even when there are no rows; `index=`
# aligns the results with the reviews they were computed from.
analysis = pd.DataFrame(analysis_results, index=df.index, columns=nlp_columns)

# Entity names are stored as one comma-separated string per review.
analysis['entities'] = analysis['entities'].map(", ".join)

for column in nlp_columns:
    df[column] = analysis[column]

# Save the updated dataset with all the NLP analysis
output_file = 'all_reviews_with_full_nlp_analysis.csv'
df.to_csv(output_file, index=False)

# Display the first few rows of the updated dataframe
print(df.head())


   unnamed:_0               name  review  \
0           0  Vladimiro Mascaro     3.0   
1           1           Lucy Loo     9.0   
2           2                 Da    21.0   
3           3        Vicki Study     1.0   
4           4      Jeffrey Bruce     5.0   

                                      date_old  rating company  \
0    Friday, October 11, 2024 at 06:25:37 p.m.       1  amazon   
1    Friday, October 11, 2024 at 09:57:32 a.m.       1  amazon   
2  Thursday, October 10, 2024 at 11:36:09 a.m.       5  amazon   
3    Friday, October 11, 2024 at 03:36:33 p.m.       1  amazon   
4  Thursday, October 10, 2024 at 05:09:02 p.m.       1  amazon   

          country                 date  \
0  United Kingdom  2024-10-01 18:25:37   
1  United Kingdom  2024-10-01 09:57:32   
2  United Kingdom  2024-10-01 11:36:09   
3  United Kingdom  2024-10-01 15:36:33   
4  United Kingdom  2024-10-01 17:09:02   

                                     topic  \
0            4 months of total incopete

In [7]:
# Reload the cleaned reviews CSV. This rebinds `df`, so columns added to a
# previously loaded `df` in this session are not present on the new frame.
df = pd.read_csv('amazon_reviews_cleaned.csv')
# Trailing expression: display the column labels as returned by read_csv.
df.columns

Index(['Unnamed: 0', 'name', 'review', 'date_old', 'rating', 'company',
       'country', 'date', 'topic', 'content'],
      dtype='object')