- https://www.databricks.com/blog/introducing-english-new-programming-language-apache-spark
- https://github.com/databrickslabs/pyspark-ai/

In [0]:
from pyspark_ai import SparkAI
# langchain is auto installed as a dependency of pyspark-ai
from langchain.chat_models import ChatOpenAI
from pyspark.sql import functions as fn, types as T

In [0]:
# If 'gpt-4' is unavailable, use 'gpt-3.5-turbo' (might lower output quality)
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Initialize SparkAI with the ChatOpenAI model
spark_ai = SparkAI(llm=llm, verbose=True)

In [0]:
spark_ai.activate()

In [0]:
# a super useful help prompt
help(spark_ai)

Help on SparkAI in module pyspark_ai.pyspark_ai object:

class SparkAI(builtins.object)
 |  SparkAI(llm: Optional[langchain.base_language.BaseLanguageModel] = None, web_search_tool: Optional[Callable[[str], str]] = None, spark_session: Optional[pyspark.sql.session.SparkSession] = None, enable_cache: bool = True, cache_file_format: str = 'json', cache_file_location: Optional[str] = None, encoding: Optional[tiktoken.core.Encoding] = None, max_tokens_of_web_content: int = 3000, verbose: bool = False) -> None
 |  
 |  Methods defined here:
 |  
 |  __init__(self, llm: Optional[langchain.base_language.BaseLanguageModel] = None, web_search_tool: Optional[Callable[[str], str]] = None, spark_session: Optional[pyspark.sql.session.SparkSession] = None, enable_cache: bool = True, cache_file_format: str = 'json', cache_file_location: Optional[str] = None, encoding: Optional[tiktoken.core.Encoding] = None, max_tokens_of_web_content: int = 3000, verbose: bool = False) -> None
 |      Initialize the 

# Plotting Data

This can be done either by passing a dataframe, or by also providing some instructions to indicate the type of plot required. Note that usefully, a wireframe design of how the data was prepared for plotting is provided

In [0]:
# lets use a builtin databricks dataset
display(dbutils.fs.ls('/databricks-datasets/wine-quality/'))

path,name,size,modificationTime
dbfs:/databricks-datasets/wine-quality/README.md,README.md,1066,1594264539000
dbfs:/databricks-datasets/wine-quality/winequality-red.csv,winequality-red.csv,84199,1594264539000
dbfs:/databricks-datasets/wine-quality/winequality-white.csv,winequality-white.csv,264426,1594264539000


In [0]:
df_wine = (spark.read
  .format("csv")
  .option("header", "true")
  # this data has an unusual delimiter
  .option("delimiter", ";")
  .option("inferSchema", "true")
  .load("/databricks-datasets/wine-quality/winequality-red.csv")
)

In [0]:
df_wine.ai.plot()

[92mINFO: [0mTo visualize the result of a Spark DataFrame using Plotly, we need to convert the DataFrame to a Pandas DataFrame first. However, since the requirement is to not use any deprecated APIs, we cannot use the `toPandas()` method.

Instead, we can use the `collect()` method of the DataFrame to retrieve all the rows as a list of `Row` objects. Then, we can convert this list of `Row` objects to a list of dictionaries, where each dictionary represents a row with column names as keys and corresponding values.

Once we have the list of dictionaries, we can create a Pandas DataFrame using the `pd.DataFrame()` constructor. Finally, we can use Plotly to visualize the Pandas DataFrame.

Here's the code to achieve this:


```
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;4

In [0]:
df_wine.ai.plot('plot total sulphur dioxide against pH')

[92mINFO: [0mTo visualize the result of `df` using plotly, we can follow these steps:

1. Import the necessary libraries:

```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mas[39;49;00m [04m[36mF[39;49;00m[37m[39;49;00m
```

2. Create a new dataframe `plot_df` by selecting the required columns from `df`:

```
plot_df = df.select([33m"[39;49;00m[33mtotal sulfur dioxide[39;49;00m[33m"[39;49;00m, [33m"[39;49;00m[33mpH[39;49;00m[33m"[39;49;00m)[37m[39;49;00m
```

3. Convert the Spark dataframe to a Pandas dataframe using `toPandas()` method:

```
pandas_df = plot_df.toPandas()[37m[39;49;00m
```

4. Use Plotly Express to create a scatter plot:

```
fig = px.scatter(pandas_df, x=[33m"[39;49;00m[33mtot

# Transforming data

In [0]:
display(df_wine.limit(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [0]:
display(df_wine.select('quality').distinct())

quality
6
3
5
4
8
7


In [0]:
df_averages = df_wine.ai.transform('group by pH and calculate the average chlorides and sulphate for each group, then order the results by ascending pH')

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00mpH,[37m [39;49;00m[34mAVG[39;49;00m(chlorides)[37m [39;49;00m[34mas[39;49;00m[37m [39;49;00mavg_chlorides,[37m [39;49;00m[34mAVG[39;49;00m(sulphates)[37m [39;49;00m[34mas[39;49;00m[37m [39;49;00mavg_sulphates[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00mtemp_view_for_transform[37m[39;49;00m
[34mGROUP[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mpH[37m[39;49;00m
[34mORDER[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mpH[37m [39;49;00m[34mASC[39;49;00m[37m[39;49;00m



In [0]:
display(df_averages)

pH,avg_chlorides,avg_sulphates
2.74,0.61,2.0
2.86,0.075,0.79
2.87,0.148,1.36
2.88,0.078,0.645
2.89,0.0775,0.47
2.9,0.414,1.33
2.92,0.0895,0.725
2.93,0.11,1.96
2.94,0.082,0.98
2.95,0.1,0.68


In [0]:
# lets check the math
display(
  df_wine.groupBy(
    'pH'
  ).agg(
    fn.avg('chlorides'),
    fn.avg('sulphates')
  ).orderBy(
    fn.col('ph').asc()
  )
)
# looks like our AI calculated result is correct

pH,avg(chlorides),avg(sulphates)
2.74,0.61,2.0
2.86,0.075,0.79
2.87,0.148,1.36
2.88,0.078,0.645
2.89,0.0775,0.47
2.9,0.414,1.33
2.92,0.0895,0.725
2.93,0.11,1.96
2.94,0.082,0.98
2.95,0.1,0.68


## Failure Case

The SQL generated fails to correctly handle a column name with a space in it

In [0]:
df_averages_2 = df_wine.ai.transform('group by "residual sugar" and calculate the average chlorides, sulphates and pH for each group')

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m
[37m  [39;49;00mresidual[37m [39;49;00msugar,[37m[39;49;00m
[37m  [39;49;00m[34mAVG[39;49;00m(chlorides)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mavg_chlorides,[37m[39;49;00m
[37m  [39;49;00m[34mAVG[39;49;00m(sulphates)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mavg_sulphates,[37m[39;49;00m
[37m  [39;49;00m[34mAVG[39;49;00m(pH)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mavg_pH[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00m
[37m  [39;49;00mtemp_view_for_transform[37m[39;49;00m
[34mGROUP[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00m
[37m  [39;49;00mresidual[37m [39;49;00msugar[37m[39;49;00m



[0;31m---------------------------------------------------------------------------[0m
[0;31mParseException[0m                            Traceback (most recent call last)
File [0;32m<command-3312646010236191>:1[0m
[0;32m----> 1[0m df_averages [38;5;241m=[39m [43mdf_wine[49m[38;5;241;43m.[39;49m[43mai[49m[38;5;241;43m.[39;49m[43mtransform[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mgroup by [39;49m[38;5;124;43m"[39;49m[38;5;124;43mresidual sugar[39;49m[38;5;124;43m"[39;49m[38;5;124;43m and calculate the average chlorides, sulphates and pH for each group[39;49m[38;5;124;43m'[39;49m[43m)[49m

File [0;32m/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/pyspark_ai/ai_utils.py:37[0m, in [0;36mAIMethodWrapper.transform[0;34m(self, desc, cache)[0m
[1;32m     24[0m [38;5;28;01mdef[39;00m [38;5;21mtransform[39m([38;5;28mself[39m, desc: [38;5;28mstr[39m, cache: [38;5;28mbool[39m [38;5;241m=[39m [38;5;28;01

# Creating UDFs

UDFS can be created with the spark_ai.udf decorator

In [0]:
@spark_ai.udf
def convert_quality(quality: int) -> str:
    """Convert the quality to a descriptive word where 1 would be awful and 10 would be amazing"""

[92mINFO: [0mCreating following Python UDF:
[34mdef[39;49;00m [32mconvert_quality[39;49;00m(quality) -> [36mstr[39;49;00m:[37m[39;49;00m
    [34mif[39;49;00m quality [35mis[39;49;00m [35mnot[39;49;00m [34mNone[39;49;00m:[37m[39;49;00m
        [34mif[39;49;00m quality == [34m1[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m"[39;49;00m[33mawful[39;49;00m[33m"[39;49;00m[37m[39;49;00m
        [34melif[39;49;00m quality == [34m2[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m"[39;49;00m[33mbad[39;49;00m[33m"[39;49;00m[37m[39;49;00m
        [34melif[39;49;00m quality == [34m3[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m"[39;49;00m[33mpoor[39;49;00m[33m"[39;49;00m[37m[39;49;00m
        [34melif[39;49;00m quality == [34m4[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m"[39;49;00m[33mfair[39;49;00m[33m"[39;49;00m[37m[39;49;00m
        [34melif[39;4

In [0]:
df_wine.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [0]:
type(convert_quality)

function

In [0]:
convert_quality(2)

'bad'

In [0]:
# strangely the @spark_ai.udf decorator seems to produce a standard function rather than a udf function so the following fails
df_wine_descriptions = df_wine.withColumn(
  'description', convert_quality('quality')
)

In [0]:
# however that is easy to rectify by adding a udf wrapper to the function
df_wine_descriptions = df_wine.withColumn(
  'description', fn.udf(convert_quality)('quality')
)

In [0]:
display(df_wine_descriptions.select('quality', 'description'))

quality,description
5,average
5,average
5,average
6,good
5,average
5,average
5,average
7,very good
7,very good
5,average


# Reading data from the web 

This can be done either by providing a search term (if you have set GOOGLE_API_KEY in your environment variables) or by providing a hypertext link. Optionally you can provide a list of column names to assist the AI in parsing the data

In [0]:
df_auto = spark_ai.create_df("2022 USA national auto sales by brand")

[0;31m---------------------------------------------------------------------------[0m
[0;31mValidationError[0m                           Traceback (most recent call last)
File [0;32m<command-3312646010236185>:1[0m
[0;32m----> 1[0m auto_df [38;5;241m=[39m [43mspark_ai[49m[38;5;241;43m.[39;49m[43mcreate_df[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43m2022 USA national auto sales by brand[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/pyspark_ai/pyspark_ai.py:293[0m, in [0;36mSparkAI.create_df[0;34m(self, desc, columns, cache)[0m
[1;32m    291[0m [38;5;66;03m# If the input is not a valid URL, use search tool to get the dataset.[39;00m
[1;32m    292[0m [38;5;28;01mif[39;00m [38;5;129;01mnot[39;00m is_url:
[0;32m--> 293[0m     url [38;5;241m=[39m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_get_url_from_search_tool[49m[43m([49m[43mdesc[49m[43m,[49m

In [0]:
df_auto = spark_ai.create_df("https://www.carpro.com/blog/full-year-2022-national-auto-sales-by-brand")

[92mINFO: [0mParsing URL: https://www.carpro.com/blog/full-year-2022-national-auto-sales-by-brand

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mauto_sales[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m
[37m    [39;49;00mbrand[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mbrand_name,[37m[39;49;00m
[37m    [39;49;00msales_2022[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00msales_2022,[37m[39;49;00m
[37m    [39;49;00msales_vs_2021[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00msales_vs_2021[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00m
[37m    [39;49;00m([34mVALUES[39;49;00m[37m [39;49;00m
[37m        [39;49;00m([33m'Toyota'[39;49;00m,[37m [39;49;00m[34m1849751[39;49;00m,[37m [39;49;00m-[34m9[39;49;00m),[37m[39;49;00m
[37m 

In [0]:
display(df_auto)

brand_name,sales_2022,sales_vs_2021
Toyota,1849751,-9
Ford,1767439,-2
Chevrolet,1502389,6
Honda,881201,-33
Hyundai,724265,-2
Kia,693549,-1
Jeep,684612,-12
Nissan,682731,-25
Subaru,556581,-5
Ram Trucks,545194,-16


## Failure Cases

The AI is still quite brittle. As such there can be failure cases when trying to read from the web

In [0]:
# a failure case where the data is not recognised
# df_40K = spark_ai.create_df("https://40kstats.goonhammer.com/#home")

[92mINFO: [0mParsing URL: https://40kstats.goonhammer.com/#home

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mwarhammer_stats[37m [39;49;00m[34mAS[39;49;00m[37m[39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m[33m'Warhammer Stats Administratum TTBattles App 40k Stats Goonhammer Media The UTC Buttscribe Core Games Other Games Columns Gaming Tactics Reviews Hobby Site News Win Rate Win Rate and Points by Faction Win Rate by Faction/Mission Win Rate by Detachment Win Rate: Faction vs Faction Scoring Primary Scoring (All) Primary Scoring (Faction) Secondary Scoring (All) Secondary Scoring (Faction) Fixed vs Tactical Scoring Visualizations Faction Popularity Events Event Placings and Lists Top Finishes by Faction TiWP Report Faction Pages Mission Pack: Start Date: End Date: Follow us Facebook Instagram Twitter Yo

In [0]:
display(df_40K)

answer
"Warhammer Stats Administratum TTBattles App 40k Stats Goonhammer Media The UTC Buttscribe Core Games Other Games Columns Gaming Tactics Reviews Hobby Site News Win Rate Win Rate and Points by Faction Win Rate by Faction/Mission Win Rate by Detachment Win Rate: Faction vs Faction Scoring Primary Scoring (All) Primary Scoring (Faction) Secondary Scoring (All) Secondary Scoring (Faction) Fixed vs Tactical Scoring Visualizations Faction Popularity Events Event Placings and Lists Top Finishes by Faction TiWP Report Faction Pages Mission Pack: Start Date: End Date: Follow us Facebook Instagram Twitter YouTube Are you a Goonhammer Patron? Click here to disable ads Manage your cookie settings 40kstats.com is a presentation of Goonhammer. Copyright Goonhammer, LLC Title Close"


In [0]:
# a failure case where the data is not recognised even with column prompts
df_40K_rates = spark_ai.create_df(
  "https://40kstats.goonhammer.com/#GbF", 
  columns=["Faction",	"Games",	"VP",	"Opp VP",	"Win %",	"Wins",	"Losses",	"Draws",	"Real Win %"]
)

[92mINFO: [0mParsing URL: https://40kstats.goonhammer.com/#GbF

[92mINFO: [0mSQL query for the ingestion:
Based[37m [39;49;00m[34mon[39;49;00m[37m [39;49;00mthe[37m [39;49;00mgiven[37m [39;49;00minformation,[37m [39;49;00mit[37m [39;49;00mseems[37m [39;49;00mthat[37m [39;49;00mthe[37m [39;49;00mrequired[37m [39;49;00m[34mdata[39;49;00m[37m [39;49;00m[34mis[39;49;00m[37m [39;49;00m[34mnot[39;49;00m[37m [39;49;00mavailable[37m [39;49;00m[34min[39;49;00m[37m [39;49;00mthe[37m [39;49;00mprovided[37m [39;49;00manswer.[37m [39;49;00mThe[37m [39;49;00manswer[37m [39;49;00m[34monly[39;49;00m[37m [39;49;00mincludes[37m [39;49;00minformation[37m [39;49;00mabout[37m [39;49;00mvarious[37m [39;49;00m[34mstatistics[39;49;00m[37m [39;49;00m[34mand[39;49;00m[37m [39;49;00m[34moptions[39;49;00m[37m [39;49;00mrelated[37m [39;49;00m[34mto[39;49;00m[37m [39;49;00mWarhammer[37m [39;49;00mStats,[37m [39;49;00mbut[37m [

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-3312646010236179>:1[0m
[0;32m----> 1[0m df_40K_rates [38;5;241m=[39m [43mspark_ai[49m[38;5;241;43m.[39;49m[43mcreate_df[49m[43m([49m
[1;32m      2[0m [43m  [49m[38;5;124;43m"[39;49m[38;5;124;43mhttps://40kstats.goonhammer.com/#GbF[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m
[1;32m      3[0m [43m  [49m[43mcolumns[49m[38;5;241;43m=[39;49m[43m[[49m[38;5;124;43m"[39;49m[38;5;124;43mFaction[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m	[49m[38;5;124;43m"[39;49m[38;5;124;43mGames[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m	[49m[38;5;124;43m"[39;49m[38;5;124;43mVP[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m	[49m[38;5;124;43m"[39;49m[38;5;124;43mOpp VP[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m	[49m[38;5;124;43m"[39;49m[38;5;124;43mWi

In [0]:
display(df_40K_rates)

answer
"Warhammer Stats Administratum TTBattles App 40k Stats Goonhammer Media The UTC Buttscribe Core Games Other Games Columns Gaming Tactics Reviews Hobby Site News Win Rate Win Rate and Points by Faction Win Rate by Faction/Mission Win Rate by Detachment Win Rate: Faction vs Faction Scoring Primary Scoring (All) Primary Scoring (Faction) Secondary Scoring (All) Secondary Scoring (Faction) Fixed vs Tactical Scoring Visualizations Faction Popularity Events Event Placings and Lists Top Finishes by Faction TiWP Report Faction Pages Mission Pack: Start Date: End Date: Follow us Facebook Instagram Twitter YouTube Are you a Goonhammer Patron? Click here to disable ads Manage your cookie settings 40kstats.com is a presentation of Goonhammer. Copyright Goonhammer, LLC Title Close"


In [0]:
# an interesting failure case where the data is ingested but fails to be converted to a dataframe, possibly due to a character limit
df_pop = spark_ai.create_df(
  "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population", 
  columns=["Country / Dependency", "Population",	"% of world",	"Date"	"Source (official or from the United Nations)"]
)

[92mINFO: [0mParsing URL: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mcountry_population[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m
[37m    [39;49;00m[33m"Country / Dependency"[39;49;00m[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mcountry,[37m[39;49;00m
[37m    [39;49;00m[33m"Population"[39;49;00m[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mpopulation,[37m[39;49;00m
[37m    [39;49;00m[33m"% ofworld"[39;49;00m[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mpercentage_of_world,[37m[39;49;00m
[37m    [39;49;00m[33m"Date Source (official or from the United Nations)"[39;49;00m[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mdata_source[37m[39;49;00m

[0;31m---------------------------------------------------------------------------[0m
[0;31mParseException[0m                            Traceback (most recent call last)
File [0;32m<command-3312646010236181>:1[0m
[0;32m----> 1[0m df_gdp [38;5;241m=[39m [43mspark_ai[49m[38;5;241;43m.[39;49m[43mcreate_df[49m[43m([49m
[1;32m      2[0m [43m  [49m[38;5;124;43m"[39;49m[38;5;124;43mhttps://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m
[1;32m      3[0m [43m  [49m[43mcolumns[49m[38;5;241;43m=[39;49m[43m[[49m[38;5;124;43m"[39;49m[38;5;124;43mCountry / Dependency[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mPopulation[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m	[49m[38;5;124;43m"[39;49m[38;5;132;43;01m% o[39;49;00m[38;5;124;43mf world[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m	[49m[38;5;124;43m"[39;49m[38;5;124;43mDate[39