# **P1. Recogida datos no estructurados. Reviews**

**Enlace** al dataset de reviews en Kaggle: https://www.kaggle.com/datasets/jaidalmotra/movies-review 

In [1]:
!pip install requests beautifulsoup4 kagglehub



# Captura de reseñas

In [4]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("jaidalmotra/movies-review")

print("Path to dataset files:", path)

# List all files in the directory
path = os.path.join(path, os.listdir(path)[0])

# Create a dictionary to store DataFrames for each genre
genre_dfs = {}

# Read all CSV files
for file_name in os.listdir(path):
    df = pd.read_csv(os.path.join(path, file_name))

    # Extract the genre from the file name
    genre = file_name.split('.')[0]

    # Store the DataFrame in the dictionary
    genre_dfs[genre] = df

Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/jaidalmotra/movies-review/versions/1


In [50]:
movies_df = pd.concat(genre_dfs.values(), ignore_index=True)
print("Total rows:", len(movies_df))
movies_df['id'] = movies_df['review_url'].str.extract(r'/title/(tt\d+)/')
movies_df = movies_df.drop_duplicates(subset='id').set_index('id')
# movies_df[movies_df['id'] == 'tt0034583']
print("Unique rows:", len(movies_df))

Total rows: 1700
Unique rows: 1150


In [51]:
movies_df.iloc[0]

Unnamed: 0,tt0114369
name,Se7en
year,1995
movie_rated,R
run_length,2h 7min
genres,Crime; Drama; Mystery;
release_date,22 September 1995 (USA)
rating,8.6
num_raters,1390231
num_reviews,1460
review_url,https://www.imdb.com/title/tt0114369/reviews/_...


In [52]:
movies_df.to_csv('movies.csv')

In [67]:
import requests
from bs4 import BeautifulSoup

def extract_imdb_reviews(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all review containers
        review_containers = soup.find_all('div', class_='review-container')

        reviews = []
        for container in review_containers:
            # Extract the review text
            review_text = container.find('div', class_='text show-more__control').text.strip()
            reviews.append(review_text)

        return reviews
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

reviews_data = []

# URL of the IMDB reviews page
for i, (movie_id, movie) in enumerate(movies_df.iterrows()):
  print("\rMovie count: ", i+1, end="")
  movie_name = movie['name']
  movie_url = movie['review_url']
  reviews = extract_imdb_reviews(movie_url)
  for review in reviews:
    reviews_data.append({'id': movie_id, 'name': movie_name, 'review': review})

# Convert the list of dictionaries into a DataFrame
reviews_df = pd.DataFrame(reviews_data)

# Set 'id' as the index
reviews_df.set_index('id', inplace=True)

reviews_df.head()

Movie count:  1150

Unnamed: 0_level_0,name,review
id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0114369,Se7en,Detective Lt. William Somerset (Morgan Freeman...
tt0114369,Se7en,It is a rarity for a film to be completely uns...
tt0114369,Se7en,"The movie, ""Se7en"", starring Brad Pitt, Morgan..."
tt0114369,Se7en,After his calamitous experience working on his...
tt0114369,Se7en,"From David Fincher, Se7en is a compelling and ..."


In [70]:
reviews_df.to_csv('reviews.csv')

In [5]:
cleaned_df = pd.read_csv("./reviews.csv")

In [7]:
cleaned_df[0:5]

Unnamed: 0,id,name,review
0,tt0114369,Se7en,Detective Lt. William Somerset (Morgan Freeman...
1,tt0114369,Se7en,It is a rarity for a film to be completely uns...
2,tt0114369,Se7en,"The movie, ""Se7en"", starring Brad Pitt, Morgan..."
3,tt0114369,Se7en,After his calamitous experience working on his...
4,tt0114369,Se7en,"From David Fincher, Se7en is a compelling and ..."


## Inserción en ElasticSearch

In [8]:
import json
from elasticsearch import Elasticsearch
es = Elasticsearch ("http://elasticsearch:9200")

In [9]:
if not es.indices.exists(index="reviews"):
    es.indices.create(index="reviews")
    print("Índice reviews creado.")
else:
    print("El índice reviews ya existe.")


Índice reviews creado.


In [10]:
### Normalizacion

import re
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

cleaned_df['review'] = cleaned_df['review'].apply(clean_text)

In [11]:
from elasticsearch.helpers import bulk

docs = []
columns = cleaned_df.columns
for idx, row in cleaned_df.iterrows():
    # Crea el documento con los campos `id`, `name` y `review`
    tupla = {
        '_index': 'reviews',
        '_id': row[columns[0]],
        '_source': {
            columns[1]: row[columns[1]],  # `name`
            columns[2]: row[columns[2]]   # `review`
        }
    }
    
    docs.append(tupla)

# Verifica el contenido del segundo documento en la lista
docs[1]

{'_index': 'reviews',
 '_id': 'tt0114369',
 '_source': {'name': 'Se7en',
  'review': 'it is a rarity for a film to be completely unsettling and yet unrelentingly grippingdavid finchers story takes place in a bleak and constantly raining city never named where urban decay and sleaze in all forms are rampant coming up to his retirement from the police force is detective lieutenant somerset morgan freeman who is tasked with breaking in his replacement detective sergeant mills brad pitt before leaving somerset is world weary under no illusions about the futility of the daily role he plays and initially wants nothing more than to escape the grime and violence of the city mills on the other hand is convinced that he is going to make a real difference having voluntarily transferred to this precinct bringing his wife to the city with him before somerset can move on a homicide comes in which he and mills are assigned to investigate but its only the first of a string of ritual murders that will 

In [12]:
# Index the documents in bulk

res = bulk(es, docs)
print(res)

(28693, [])


In [None]:
### Ejemplo de consulta sobre el director M. Night Shyamalan

In [14]:
res = es.search(index='reviews', body={
    'query': {
    'match': { 'review': 'Shyamalan' }
    }
})
print(json.dumps(res.body, indent=4))

{
    "took": 14,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 4,
            "relation": "eq"
        },
        "max_score": 7.9168596,
        "hits": [
            {
                "_index": "reviews",
                "_type": "_doc",
                "_id": "tt0368447",
                "_score": 7.9168596,
                "_ignored": [
                    "review.keyword"
                ],
                "_source": {
                    "name": "The Village",
                    "review": "how does one write an intelligent coherent review of a movie that made me feel like i was not only cheated but done so at a shameless way m night shyamalan has for the last three out of five movies focused on making scifihorror films that rely on only one thing a fantastic twist at or near the movies climax or critical point which of course would make 