In [None]:
import os
from datetime import timedelta, datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
from scipy.stats import zscore
from sklearn.feature_extraction.text import CountVectorizer
from superlinked import framework as sl


# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None

# Load the raw TMDB export; the path is relative to the notebook's working directory.
data = pd.read_csv("TMDB_movie_dataset_v11.csv")

In [None]:
# Column names, dtypes and non-null counts of the raw dataset.
data.info()

In [None]:
# Preview the first rows of the raw dataset.
data.head()

We can clean this data as a first step and keep only the most relevant parts. Some features are not necessary for this analysis, and we can also narrow the scope of the dataset, for example by focusing only on English-language movies.

Clean dataset from unwanted variables

In [None]:
# Frequency of each release status, missing values included.
data["status"].value_counts(dropna=False)

I don't care about non-released movies, nor about adult movies.

In [None]:
# Frequency of the adult flag, missing values included.
data["adult"].value_counts(dropna=False)

In [None]:
# Share of each original language, missing values included.
data["original_language"].value_counts(dropna=False, normalize=True)

Half of the movies are in English; the next most common language, French, accounts for only about 5%.

In [None]:
# Share of each spoken-language value among movies whose original language is English.
english_original = data["original_language"] == "en"
data.loc[english_original, "spoken_languages"].value_counts(dropna=False, normalize=True)

About 80% of the movies whose original language is English list English as their spoken language, so let's assume we can drop this feature and keep all English-original movies.

In [None]:
# Keep released, non-adult, English-original movies that have a release date,
# then drop the columns that are constant after filtering or not used later on.
unused_columns = [
    'original_language', 'spoken_languages', 'status', 'adult',
    "backdrop_path", "homepage", "imdb_id", "poster_path",
]
keep_rows = (
    data["original_language"].eq('en')
    & data["status"].eq('Released')
    & data["adult"].eq(False)
    & data["release_date"].notna()
)
data = data[keep_rows].drop(columns=unused_columns)


In [None]:
# Re-check dtypes and non-null counts after filtering rows and dropping columns.
data.info()

# Does release date affect votes (count and average)?

Independently of all other features, we want to know whether the release date, or the time of year, affects the votes positively or negatively.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the two vote columns.
data[['vote_count', 'vote_average']].describe()

In [None]:
# Histograms of vote_count and vote_average on a log-scaled x axis.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

votes = data[['vote_count', 'vote_average']].replace([np.inf, -np.inf], np.nan).dropna()

# log10(0) is -inf, so zero values cannot be placed on a log axis. Drop them
# explicitly, per column, instead of relying on how seaborn treats -inf.
panels = [("vote_count", "Vote Count"), ("vote_average", "Vote Average")]
for ax, (column, label) in zip(axes, panels):
    positive_values = votes.loc[votes[column] > 0, column]
    sns.histplot(x=positive_values, bins=20, kde=True, log_scale=True, ax=ax)
    ax.set_title(label)
    ax.set_xlabel(label)
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

We clearly see that the two distributions look like power-law distributions, skewed left and then right, respectively.

In [None]:
# Pearson correlation between the two vote columns after a log1p transform.
vote_columns = ['vote_count', 'vote_average']
data[vote_columns].apply(np.log1p).corr()

In [None]:
# Scatter of vote_average against vote_count for movies with at least one vote.
# Both columns are log1p-transformed before plotting.
has_votes = data['vote_count'] > 0
df_plot = np.log1p(data.loc[has_votes, ['vote_count', 'vote_average']].dropna())

fig, ax = plt.subplots(figsize=(10, 6))

# plot the relationship between vote count and vote average
sns.scatterplot(data=df_plot, x='vote_count', y='vote_average', alpha=0.3, ax=ax)

# The axes show log1p values, so label them as such (the y axis used to read
# plain "Vote Average" although it is transformed too).
ax.set_xlabel('Vote Count (log1p)')
ax.set_ylabel('Vote Average (log1p)')
ax.set_title('Vote Average vs. Vote Count (log1p-transformed)')
fig.tight_layout()
plt.show()

There is a clear trend: the more votes a movie has, the more it seems to be appreciated by the public. In other words, people tend to vote for the movies they like.

In [None]:
# Parse the release date (unparseable values become NaT) and derive the month
# name, the year and the weekday name from the parsed date.
data = data.assign(
    release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'),
    release_month=lambda frame: frame['release_date'].dt.month_name(),
    release_year=lambda frame: frame['release_date'].dt.year,
    release_day=lambda frame: frame['release_date'].dt.day_name(),
)

In [None]:
# Give months and weekdays their calendar order so plots are not sorted alphabetically.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

data['release_month'] = pd.Categorical(data['release_month'], categories=month_order, ordered=True)
data["release_day"] = pd.Categorical(data["release_day"], categories=days_order, ordered=True)

years = data['release_year']
months = data['release_month']
days = data['release_day']

# One histogram per calendar feature: (values, title, x label).
panels = [
    (years, "Distribution of Release Years", "Release Year"),
    (months, "Distribution of Release Months", "Release Month"),
    (days, "Distribution of Release Days", "Release Days"),
]

fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, (values, title, xlabel) in zip(axes, panels):
    sns.histplot(values, bins=20, kde=True, log_scale=False, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")

# Month and weekday names are long: tilt them so they do not overlap.
for ax in axes[1:]:
    plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

First of all, we see that more and more movies are released over the years. This reflects a new pattern among cinema studios and growing investment in the cinema industry.

We have a peak of releases in January — why?

We also see that releases are fairly uniform across the days of the week, with a slight peak on Friday, probably because of the weekend.

Let's try to explain this January peak using the release-year distribution.

In [None]:
# Compare the release-year distribution of January releases with all other months.
january_movies = data[data['release_month'] == 'January']
other_movies = data[data['release_month'] != 'January']

# Overlay the two density histograms on one axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(january_movies['release_year'], bins=30, color='blue', label='January',
             kde=True, stat='density', ax=ax)
sns.histplot(other_movies['release_year'], bins=30, color='orange', label='Other Months',
             kde=True, stat='density', alpha=0.6, ax=ax)
ax.set_title("Distribution of Release Year: January vs Other Months")
ax.set_xlabel("Release Year")
ax.set_ylabel("Density")
ax.legend()
fig.tight_layout()
plt.show()

# Two-sample Kolmogorov–Smirnov test on the release years (missing years dropped).
x = january_movies['release_year'].dropna()
y = other_movies['release_year'].dropna()
ks_stat, p_value = ks_2samp(x, y)

print(f"KS Statistic: {ks_stat:.4f}")
print(f"P-Value: {p_value:.4f}")

# Interpret at the 5% significance level.
verdict = (
    "✅ The distributions are significantly different (reject H0)."
    if p_value < 0.05
    else "🟡 The distributions are not significantly different (fail to reject H0)."
)
print(verdict)

The KS statistic, or Kolmogorov-Smirnov statistic, is a measure used in the Kolmogorov-Smirnov test to assess the similarity between two probability distributions. It quantifies the maximum vertical distance between the cumulative distribution functions (CDFs) of the two distributions being compared. Essentially, it helps determine how different two distributions are by measuring the largest discrepancy between their cumulative probabilities. 

We reject the null hypothesis.

There is strong evidence that the distribution of release years in January is different from that in the other months.

This supports your hypothesis:
January is disproportionately populated with older movies, especially pre-2000 ones.



What would be the minimum year to include in order to smooth the release-month distribution and make it closer to uniform?

In [None]:
# Number of movies per release year: all months, and January only.
year_counts = data.groupby('release_year').size()
is_january = data['release_month'] == 'January'
january_counts = data[is_january].groupby('release_year').size()

# Align both series on the year; years without a January release get 0.
year_df = pd.DataFrame({'total': year_counts, 'january': january_counts}).fillna(0)

# Percentage of each year's movies that were released in January.
year_df['jan_percent'] = year_df['january'].mul(100).div(year_df['total'])

fig, ax1 = plt.subplots(figsize=(12, 6))
year_df['jan_percent'].plot(ax=ax1, color='red', label='% January', linewidth=2)

ax1.set(
    xlabel='Release Year',
    ylabel='% Released in January',
    title='Total Movies January Release Share',
)
fig.legend(loc='upper left')

plt.setp(ax1.get_xticklabels(), rotation=90)
plt.tight_layout()
plt.show()


Before 1980:

Extremely volatile and artificially high % of movies released in January — in some years it's over 50% or even 100%.

This strongly suggests default or placeholder dates, possibly due to missing metadata.


1980–1999:

Still elevated and noisy, but stabilizing.

January still over-represented in many years (30–40%).


2000 onward:

Much more stable.

January release share gradually declines below 20%, which feels realistic.


Let's take release_year >= 1990 to include most of modern Hollywood cinema (John Doe, Tarantino, Die Hard 2, The Matrix).

After some research on the web, it appears that January is known as the "dump month": the time when studios release the movies they are less confident in. So let's investigate, using the popularity variable.


In [None]:
# Restrict the analysis to releases from 1990 to 2022, both bounds included.
data = data[data['release_year'].between(1990, 2022)]

In [None]:
# Box plot of popularity for each release month.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='release_month', y='popularity', data=data, ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Distribution of Popularity by Release Month')
ax.set_xlabel('Release Month')
ax.set_ylabel('Popularity')
fig.tight_layout()
plt.show()

# One-sided Mann–Whitney U test: is January popularity lower than in the other months?
# Popularity is log1p-transformed first, so the printed medians are on that scale.
jan_pop = np.log1p(data.loc[data['release_month'] == 'January', 'popularity'])
other_pop = np.log1p(data.loc[data['release_month'] != 'January', 'popularity'])
print(f"January median popularity: {jan_pop.median():.2f}")
print(f"Other months median popularity: {other_pop.median():.2f}")
stat, p = mannwhitneyu(jan_pop, other_pop, alternative='less')

print(f"Mann–Whitney U statistic: {stat:.2f}")
print(f"P-value: {p:.4f}")

# Interpret at the 5% significance level.
conclusion = (
    "✅ January movies have significantly lower popularity."
    if p < 0.05
    else "🟡 No significant difference in popularity."
)
print(conclusion)

The Mann-Whitney U test, also known as the Wilcoxon rank-sum test, is a non-parametric statistical test used to compare two independent groups. It determines if there's a statistically significant difference between the medians (or distributions) of the two groups, particularly when the data is not normally distributed or when the assumptions of parametric tests like the independent samples t-test are violated. 

Let's focus on how time affects the vote count and the vote average.

In [None]:
# Mean vote count, mean rating and number of movies, per release year and per release month.
vote_aggregations = dict(
    avg_votes=('vote_count', 'mean'),
    avg_rating=('vote_average', 'mean'),
    n_movies=('id', 'count'),
)
stats_years = data.groupby('release_year', observed=True).agg(**vote_aggregations).reset_index()
stats_months = data.groupby('release_month', observed=True).agg(**vote_aggregations).reset_index()

# 2x2 grid: top row is by year (line plots), bottom row is by month (bar plots);
# left column is the vote count, right column is the rating.
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# (aggregated column, y-axis label, noun used in the panel titles)
metrics = [
    ('avg_votes', 'Vote Count', 'Count'),
    ('avg_rating', 'Vote Average', 'Rating'),
]
for col, (metric, ylabel, noun) in enumerate(metrics):
    year_ax = axes[0, col]
    sns.lineplot(data=stats_years, x='release_year', y=metric, ax=year_ax)
    year_ax.set_title(f'Avg Vote {noun} by Year')
    year_ax.set_xlabel('Release Year')
    year_ax.set_ylabel(ylabel)

    month_ax = axes[1, col]
    sns.barplot(data=stats_months, x='release_month', y=metric, ax=month_ax)
    month_ax.set_title(f'Avg Vote {noun} by Month')
    month_ax.set_xlabel('Release Month')
    month_ax.set_ylabel(ylabel)
    month_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

Top Left: Average Vote Count by Year

What we see:
1. From 1990–2015: relatively high and stable average vote counts (between ~55 and 75).
2. From 2016–2022: clear downward trend, especially steep after 2020.

Interpretation:
1. Newer movies haven’t had enough time to accumulate votes → **recency bias**.
2. This decline is not quality-driven, but a data maturity issue.

**Suggestion**: Consider filtering out the last 3–5 years for some analyses.


Top Right: Average Vote Rating by Year

What we see:
1. Fairly stable rating ~2.4–2.6 from 1995 to 2018.
2. Dramatic drop after 2019, especially in 2021–2023.

Interpretation:
1. Again, possibly a data artifact — newer movies might:
2. Not have enough votes to average out rating bias.
3. Attract early ratings from fans/critics only.
4. post-pandemic changes in audience behavior or production quality?


Bottom Left: Average Vote Count by Month

What we see:
1. January has the lowest vote count — massive drop.
2. Summer (June–August) and December have the highest average vote counts.

Interpretation:
1. Confirms the “dump month” theory for January.
2. Blockbuster-heavy months like July and December get more attention → more votes.


Bottom Right: Average Vote Rating by Month

What we see:
1. January again at the bottom (~1.8 avg), while September and March–April are higher (~2.6).
2. Middle months vary slightly but don’t diverge much.

Interpretation:
1. Further supports the dump-month effect — January films tend to be lower rated.
2. Late summer and early fall might include festival, Oscar-contending, or better-reviewed films.


In [None]:
# Apply the suggestion: drop the five most recent release years, whose vote
# statistics have not matured yet.
# NOTE: the cutoff is derived from the current maximum year, so re-running this
# cell on its own trims another five years each time.
latest_year = data['release_year'].max()
data = data[data['release_year'] <= latest_year - 5]

If the year and the month affect how studios organize their releases, I should also see this behavior in the budget and the revenue. I suppose the genre and the runtime of the movies will be affected too.

Let's analyze how budget, revenue, and runtime evolve over the years and across the months of the year, and how the genres of movies change over time. What kinds of movies are released more than before? Is there a specific genre for a specific month?

# Analyze Time vs Budget, Revenue, Runtime and Genres

How do release timing (year, month) influence budget, revenue, runtime, and genre?

In [None]:
# Keep only movies whose budget, revenue and runtime are all strictly positive
# (zeros and missing values are both excluded by the comparison).
has_positive_values = data[['budget', 'revenue', 'runtime']].gt(0).all(axis=1)
data = data[has_positive_values]

In [None]:
# Mean budget, revenue and runtime plus the movie count, per release year and per release month.
financial_aggregations = dict(
    avg_budget=('budget', 'mean'),
    avg_revenue=('revenue', 'mean'),
    avg_runtime=('runtime', 'mean'),
    n_movies=('id', 'count'),
)

stats_years = (
    data.groupby('release_year', observed=True)
    .agg(**financial_aggregations)
    .reset_index()
)
stats_months = (
    data.groupby('release_month', observed=True)
    .agg(**financial_aggregations)
    .reset_index()
)

In [None]:
# 2x3 grid: top row is by year (line plots), bottom row is by month (bar plots);
# one column per metric.
fig, axes = plt.subplots(2, 3, figsize=(18, 8))

# (aggregated column, name used in the panel titles)
metrics = [('avg_budget', 'Budget'), ('avg_revenue', 'Revenue'), ('avg_runtime', 'Runtime')]
for col, (metric, name) in enumerate(metrics):
    year_ax, month_ax = axes[0, col], axes[1, col]

    sns.lineplot(data=stats_years, x='release_year', y=metric, ax=year_ax)
    year_ax.set_title(f'Avg {name} by Year')

    sns.barplot(data=stats_months, x='release_month', y=metric, ax=month_ax)
    month_ax.set_title(f'Avg {name} by Month')
    month_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# One-hot encode the comma-separated `genres` column.

# Replace NaNs with an empty string so every row is a string. `assign` returns a
# new frame instead of writing a column into a filtered slice of an earlier
# frame, which is what triggers pandas' SettingWithCopyWarning.
data = data.assign(genres_clean=data['genres'].fillna(''))

# binary=True        -> 0/1 indicators rather than counts.
# lowercase=False    -> keep genre names exactly as they appear in the data, so
#                       `genre_labels` can be compared with the raw genre values
#                       (the default lower-cases every label).
# token_pattern=None -> the default pattern is unused when a tokenizer is given;
#                       None avoids scikit-learn's "will not be used" warning.
vectorizer = CountVectorizer(
    tokenizer=lambda text: text.split(', '),
    binary=True,
    lowercase=False,
    token_pattern=None,
)

# Fit only on rows that actually list at least one genre.
data_genres_train = data[data['genres_clean'] != '']
genre_matrix = vectorizer.fit_transform(data_genres_train['genres_clean'])

# One label per indicator column, in the matrix's column order.
genre_labels = vectorizer.get_feature_names_out()

# Dense indicator frame aligned on the original row index.
genre_df = pd.DataFrame(genre_matrix.toarray(), columns=genre_labels, index=data_genres_train.index)

# Original columns followed by one indicator column per genre.
data_genres = pd.concat([data_genres_train, genre_df], axis=1)


In [None]:
# Number of movies carrying each genre, per release year.
genre_by_year = data_genres.groupby('release_year')[genre_labels].sum()

# Normalize each year's row so the genre shares of that year sum to 1.
genre_by_year = genre_by_year.div(genre_by_year.sum(axis=1), axis=0).reset_index()

# Melt into long format for Plotly: one row per (year, genre).
genre_melted = genre_by_year.melt(
    id_vars='release_year',
    value_vars=genre_labels,
    var_name='Genre',
    value_name='Proportion'
)

# Stacked bar chart of genre shares per year. The `labels` key must be the
# plotted column name (`release_year`) for the axis title to be renamed.
fig = px.bar(
    genre_melted,
    x='release_year',
    y='Proportion',
    color='Genre',
    title='Distribution Genres per Year',
    labels={'release_year': 'Year', 'Proportion': 'Proportion'}
)

fig.update_layout(barmode='stack', xaxis_tickangle=45, height=600)
fig.show()

In [None]:
# Number of movies carrying each genre, per release month.
# observed=True matches the other month-level groupbys in this notebook: only
# months present in the data produce a row, so no all-zero row can turn into
# NaN shares in the normalization below, and pandas' FutureWarning about the
# changing `observed` default is avoided.
genre_by_month = data_genres.groupby('release_month', observed=True)[genre_labels].sum()

# Normalize each month's row so the genre shares of that month sum to 1.
genre_by_month = genre_by_month.div(genre_by_month.sum(axis=1), axis=0).reset_index()

# Melt into long format for Plotly: one row per (month, genre).
genre_melted = genre_by_month.melt(
    id_vars='release_month',
    value_vars=genre_labels,
    var_name='Genre',
    value_name='Proportion'
)

# Stacked bar chart of genre shares per month, months in calendar order.
fig = px.bar(
    genre_melted,
    x='release_month',
    y='Proportion',
    color='Genre',
    title='Distribution Genres per Month',
    category_orders={'release_month': month_order},
    labels={'release_month': 'Month', 'Proportion': 'Proportion'}
)

fig.update_layout(barmode='stack', xaxis_tickangle=45, height=600)
fig.show()

# SuperLinked

In [None]:

# Superlinked schema describing one movie. Each annotated attribute is a field
# that the spaces, the parser and the query below reference as `movie.<field>`.
# Every field except `id` is declared optional (`| None`).
class MovieSchema(sl.Schema):
    id: sl.IdField
    title: sl.String | None
    rating: sl.Float | None  # filled from the `vote_average` column through the parser mapping
    release_date: sl.Timestamp | None  # input of the recency space
    runtime: sl.Integer | None
    overview: sl.String | None
    genres: sl.StringList | None  # list of genre names
    keywords: sl.StringList | None  # list of keyword strings
    
# class UserSchema(sl.Schema):
#     id: sl.IdField
#     preference_title: sl.String | None
#     preference_rating: sl.Float | None
#     preference_release_date: sl.Timestamp | None
#     preference_runtime: sl.Integer | None
#     preference_overview: sl.String | None
#     preference_genres: sl.StringList | None
#     preference_keywords: sl.StringList | None

In [None]:
# initialize schemas
# `movie` is the schema instance whose fields (movie.title, movie.rating, ...)
# are passed to the spaces, the parser and the query in the cells below.
movie = MovieSchema()
# user = UserSchema()

In [None]:
# Build the vocabulary of individual keywords from the comma-separated `keywords` column.
keyword_strings = data["keywords"].dropna().unique().tolist()

# Flatten: every distinct cell value contributes each of its ", "-separated tokens.
keywords = [token for entry in keyword_strings for token in entry.split(', ')]

# De-duplicate and sort. Iteration order of a set of strings changes between
# Python sessions (hash randomisation), so an unsorted list would give the
# categories a different order on every run.
kw = sorted(set(keywords))

In [None]:
# Define one space per schema field used for retrieval.

# Sentence-embedding model shared by the two text spaces.
TEXT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Text similarity on the movie title.
title_space = sl.TextSimilaritySpace(text=movie.title, model=TEXT_EMBEDDING_MODEL)

# Rating on a 0–10 scale, in MAXIMUM mode.
rating_space = sl.NumberSpace(
    number=movie.rating,
    mode=sl.Mode.MAXIMUM,
    min_value=0.0,
    max_value=10.0,
)

# Recency of the release date over 4-, 8-, 16- and 32-year periods (365-day years).
recency_period_years = (4, 8, 16, 32)
release_date_space = sl.RecencySpace(
    timestamp=movie.release_date,
    period_time_list=[
        sl.PeriodTime(timedelta(days=n_years * 365)) for n_years in recency_period_years
    ],
    negative_filter=-0.25,
)

# Runtime between 0 and 999, in MAXIMUM mode.
runtime_space = sl.NumberSpace(
    number=movie.runtime,
    mode=sl.Mode.MAXIMUM,
    min_value=0,
    max_value=999,
)

# Text similarity on the movie overview.
overview_space = sl.TextSimilaritySpace(text=movie.overview, model=TEXT_EMBEDDING_MODEL)

# Categorical similarity on genres; the categories come from the genre vocabulary.
genres_space = sl.CategoricalSimilaritySpace(
    category_input=movie.genres,
    categories=list(genre_labels),
    uncategorized_as_category=False,
)

# Categorical similarity on keywords; the categories come from the keyword vocabulary.
keywords_space = sl.CategoricalSimilaritySpace(
    category_input=movie.keywords,
    categories=kw,
    uncategorized_as_category=False,
)

In [None]:
# Create the index over all seven spaces; the query below weights each of them.
indexed_spaces = [
    title_space,
    rating_space,
    release_date_space,
    runtime_space,
    overview_space,
    genres_space,
    keywords_space,
]
movie_index = sl.Index(spaces=indexed_spaces)

In [None]:
# parse data into schemas
# The mapping feeds the schema's `rating` field from the `vote_average` column;
# no other field is remapped here.
movie_df_parser = sl.DataFrameParser(schema=movie, mapping={movie.rating: "vote_average"})
# user_df_parser = sl.JsonParser(schema=user)

In [None]:
# setup the source - In memory
# The source is bound to the movie schema and uses the DataFrame parser defined above.
source_movie: sl.InMemorySource = sl.InMemorySource(movie, parser=movie_df_parser)
# source_user: sl.InMemorySource = sl.InMemorySource(user, parser=user_df_parser)

# set up the executor
# The executor connects the source to the index.
executor: sl.InMemoryExecutor = sl.InMemoryExecutor(sources=[source_movie], indices=[movie_index])

# define the app
# `app` is the handle used to run queries in the cells below.
app: sl.InMemoryApp = executor.run()

In [None]:
# load the actual data into our system
# Only rows with every schema column present are ingested.
movie_columns = ["id", "title", "vote_average", "release_date", "runtime", "overview", "genres", "keywords"]

# The schema declares `genres` and `keywords` as string lists, while the frame
# stores them as ", "-separated strings: split them into lists before ingestion.
movies_to_index = data[movie_columns].dropna().assign(
    genres=lambda frame: frame["genres"].str.split(", "),
    keywords=lambda frame: frame["keywords"].str.split(", "),
)
# NOTE(review): `release_date` is a pandas datetime column here while the schema
# field is sl.Timestamp — confirm the parser accepts datetimes, or convert to
# epoch seconds first.
source_movie.put([movies_to_index])

# source_user.put(
#     [
#         {
#             "id": "user_1", 
#             "preference_title": None, 
#             "preference_rating":None, 
#             "preference_release_date": None,
#             "preference_runtime": None,
#             "preference_overview": "Hold up in a very securized bank",
#             "preference_genres": ["action, thriller", "psychologic"],
#             "preference_keywords": None
#         },
#         {
#             "id": "user_2", 
#             "preference_title": None, 
#             "preference_rating":None, 
#             "preference_release_date": None,
#             "preference_runtime": None,
#             "preference_overview": "Drama about heritage in noble family",
#             "preference_genres": None,
#             "preference_keywords": "adultery"
#         },
#         {
#             "id": "user_3", 
#             "preference_title": None, 
#             "preference_rating":None, 
#             "preference_release_date": None,
#             "preference_runtime": None,
#             "preference_overview": "Hold up in a very securized bank",
#             "preference_genres": ["action, thriller", "psychologic"],
#             "preference_keywords": None  
#         }
#     ]
# )

In [None]:
# Query object: every space weight and every search input is a named parameter
# supplied at query time.
spaces_by_name = {
    "title": title_space,
    "rating": rating_space,
    "release_date": release_date_space,
    "runtime": runtime_space,
    "overview": overview_space,
    "genres": genres_space,
    "keywords": keywords_space,
}

# "<name>_space_weight" parameter for each space.
space_weights = {
    space: sl.Param(f"{name}_space_weight") for name, space in spaces_by_name.items()
}

query = sl.Query(movie_index, weights=space_weights).find(movie)

# "<name>_query_text" similarity input for the text and categorical spaces.
for name in ("title", "overview", "genres", "keywords"):
    query = query.similar(spaces_by_name[name], sl.Param(f"{name}_query_text"))

query = query.select_all().limit(sl.Param("limit"))

In [None]:
# Run a sample query with equal weight on every space.

# The genre and keyword spaces are categorical: they compare against individual
# category values, so the inputs are lists of categories rather than one
# free-text string. Genres are picked from `genre_labels` case-insensitively so
# they match however the labels are cased.
# NOTE(review): confirm `.similar` on a categorical space accepts a list of strings.
wanted_genres = {"drama", "comedy", "romance"}
genres_query = [str(genre) for genre in genre_labels if str(genre).lower() in wanted_genres]
keywords_query = ["love", "romance", "comedy"]

result = app.query(
    query,
    title_query_text="love",
    overview_query_text="A love story between a man and a woman",
    genres_query_text=genres_query,
    keywords_query_text=keywords_query,
    title_space_weight=1,
    rating_space_weight=1,
    release_date_space_weight=1,
    runtime_space_weight=1,
    overview_space_weight=1,
    genres_space_weight=1,
    keywords_space_weight=1,
    limit=10,
)

# Convert the result to a DataFrame and add a year-only `release_year` column.
df = sl.PandasConverter.to_pandas(result)
sl.PandasConverter.format_date_column(df, "release_date", "release_year", year_only=True)
df

In [None]:
# The OpenAI API key is read from the OPENAI_API_KEY environment variable (set it
# before running this cell; a missing variable raises KeyError). The config
# drives parameter extraction for the natural-language query.
# `os` is imported in the import cell at the top of the notebook.
openai_config = sl.OpenAIClientConfig(api_key=os.environ["OPENAI_API_KEY"], model="gpt-4o")


In [None]:
# Same query, extended with a natural-language input ("natural_query") that is
# processed with the OpenAI configuration defined above.
nlq_query = query.with_natural_query(sl.Param("natural_query"), openai_config)


In [None]:
# Run the natural-language query. Only the free-text question and the result
# limit are passed here; no weights or *_query_text values are given explicitly.
nlq_result = app.query(
    nlq_query,
    natural_query="Best action movies in the year 2001",
    limit=10,
)

# Convert the result to a DataFrame and add a year-only `release_year` column.
df = sl.PandasConverter.to_pandas(nlq_result)
sl.PandasConverter.format_date_column(df, "release_date", "release_year", year_only=True)
df

In [None]:
# Build a RAG prompt from the top retrieved movies.
# This cell previously read `normal_recency_usefulness_result`, a "body" column
# and `initial_query_text`, none of which exist in this notebook. It now uses
# the natural-language query result and the movie overviews.
context_items_from_retrieval: int = 5

# The question put to the LLM; same text as the natural-language query above.
initial_query_text: str = "Best action movies in the year 2001"

# NOTE(review): assumes the converted result exposes the schema's `overview`
# field as a column (the query uses select_all) — confirm.
retrieved_overviews = (
    sl.PandasConverter.to_pandas(nlq_result)["overview"]
    .dropna()
    .astype(str)
    .iloc[:context_items_from_retrieval]
)

# One overview per line, wrapped in leading and trailing newlines.
context_text: str = "\n" + "\n".join(retrieved_overviews) + "\n"

rag_query = f"""<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.
<</SYS>>

Please answer the following question by using information from the provided context information!
CONTEXT_INFORMATION: {context_text}
QUESTION: {initial_query_text}
[/INST]"""